Changeset View
Standalone View
sys/kern/vfs_lookup.c
Show First 20 Lines • Show All 96 Lines • ▼ Show 20 Lines | |||||
/* SDT (DTrace) probes for namei lookup entry and return. */
SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
    "unsigned long", "bool");
SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool",
    "struct nameidata");

/* Allocation zone for namei. */
uma_zone_t namei_zone;

/* Forward declaration (for use in vn_cross_mounts()). */
static int enforce_lkflags(struct mount *mp, int lkflags);

/* Placeholder vnode for mp traversal. */
static struct vnode *vp_crossmp;
static int | static int | ||||
crossmp_vop_islocked(struct vop_islocked_args *ap) | crossmp_vop_islocked(struct vop_islocked_args *ap) | ||||
{ | { | ||||
return (LK_SHARED); | return (LK_SHARED); | ||||
Show All 38 Lines | static struct vop_vector crossmp_vnodeops = { | ||||
.vop_islocked = crossmp_vop_islocked, | .vop_islocked = crossmp_vop_islocked, | ||||
.vop_lock1 = crossmp_vop_lock1, | .vop_lock1 = crossmp_vop_lock1, | ||||
.vop_unlock = crossmp_vop_unlock, | .vop_unlock = crossmp_vop_unlock, | ||||
}; | }; | ||||
/*
 * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
 * gets allocated early.  See nameiinit for the direct call below.
 */
/*
 * Returns busied the mount point mounted on the passed vnode, if any.
 *
 * The vnode's lock must be held and may be released on output, as indicated by
 * '*unlocked'.  The caller must also have an active reference on the vnode
 * (vref() or vget()) which is preserved across the call.  On success, the
 * busied mount point is passed through 'mp'.
 *
 * If the vnode is not mounted-on, EJUSTRETURN is returned and '*mp' is set to
 * NULL.  Concurrent unmounts/remounts of the covering mount are handled
 * transparently by restarting the process (doing so is currently not really
 * necessary for correctness but is closer to the historical behavior where the
 * unmounts/remounts were prevented to happen in this case, and will be required
 * (but not enough) if we ever want to implement such things as atomic mount
 * substitutions).  ENOENT is returned if the vnode was doomed while trying to
 * determine its covering mount, and '*mp' is set to NULL.  Else, '*mp' is set
 * to the busied mount point and 0 is returned.
 */
int
vn_busy_mountedhere(struct vnode *vp, bool *unlocked, struct mount **mp)
{
	int error;

	ASSERT_VOP_LOCKED(vp, __func__);
	ASSERT_VI_UNLOCKED(vp, __func__);

	*unlocked = false;
	*mp = NULL;

	/*
	 * A doomed vnode cannot be mounted-on; report it distinctly so that
	 * standalone callers (i.e., not coming through vn_cross_mounts()) need
	 * not pre-check.
	 */
	if (VN_IS_DOOMED(vp))
		return (ENOENT);
	/* Common case: the vnode is not a mount point; nothing to do. */
	if (__predict_true((vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0))
		return (EJUSTRETURN);

	*mp = vp->v_mountedhere;
	MPASS(*mp != NULL);

	/*
	 * Opportunistically try to busy the mount point.  On success, this can
	 * avoid a superfluous unlock/relock cycle on 'vp' in some cases (in
	 * particular, the vfs_lookup() case), and always avoids a pair of
	 * vfs_ref()/vfs_rel() calls.
	 */
	error = vfs_busy(*mp, MBF_NOWAIT);
	if (__predict_true(error == 0))
		return (error);

	/* Make sure '*mp' survives the unlock of 'vp'. */
	vfs_ref(*mp);
	VOP_UNLOCK(vp);
	*unlocked = true;
	for (;;) {
		/* Sleepable attempt; drop our extra reference either way. */
		error = vfs_busy(*mp, 0);
		vfs_rel(*mp);
		if (__predict_true(error == 0))
			return (0);

		/*
		 * The busy attempt failed (e.g., concurrent unmount).  Recheck
		 * the vnode's state under the interlock and possibly retry
		 * with a new covering mount.
		 */
		*mp = NULL;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			error = ENOENT;
			goto unlock_exit;
		}
		if (__predict_true
		    ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0)) {
			error = EJUSTRETURN;
			goto unlock_exit;
		}

		/*
		 * We can't retrieve the same (conceptual) mount point as before
		 * since the vfs_busy() above returned with an error only after
		 * 'v_mountedhere' was cleared on the covered vnode (but we well
		 * could retrieve the same pointer in case the structure is
		 * recycled).
		 */
		*mp = vp->v_mountedhere;
		MPASS(*mp != NULL);
		/*
		 * This establishes the order "covered vnode's interlock" ->
		 * "mounted-here mount point interlock".  Note that this order
		 * between a vnode and a mount point is the reverse of that of
		 * "vnode's owning mount point" -> "vnode's interlock", without
		 * causing trouble since the mount point is different in both
		 * cases.  This causes a spurious LOR with the initial 'devfs'
		 * being mounted at '/' and then remounted at '/dev' (see
		 * vfs_mountroot()).
		 */
		vfs_ref(*mp);
		VI_UNLOCK(vp);
	}
	__assert_unreachable();

unlock_exit:
	VI_UNLOCK(vp);
	return (error);
}
/* | |||||
* Cross a single mounted-on vnode, returning the mount's root vnode. | |||||
* | |||||
* The vnode's lock must be held, and may be unlocked on output as indicated by | |||||
* '*unlocked' or on success. The caller must also have an active reference on | |||||
* the vnode (vref() or vget()) which is preserved across the call. On success, | |||||
* the mount's root vnode is returned locked according to 'root_lkflags' and | |||||
* with an active reference. | |||||
* | |||||
* Behaves essentially for errors and outputs as vn_busy_mountedhere(), which it | |||||
* calls first, with '*vpp' taking the role of '*mp'. In case of success for | |||||
* this step, VFS_ROOT() is called and its result returned. In case of any | |||||
* error, '*vpp' is set to NULL. On overall success, '*unlocked' is guaranteed | |||||
* to be set to true. | |||||
*/ | |||||
int | |||||
vn_cross_single_mount(struct vnode* vp, int root_lkflags, | |||||
bool *unlocked, struct vnode **vpp) | |||||
{ | |||||
struct mount *mp; | |||||
int error; | |||||
*vpp = NULL; | |||||
error = vn_busy_mountedhere(vp, unlocked, &mp); | |||||
if (__predict_false(error == 0)) { | |||||
if (__predict_true(!*unlocked)) { | |||||
VOP_UNLOCK(vp); | |||||
*unlocked = true; | |||||
} | |||||
error = VFS_ROOT(mp, root_lkflags, vpp); | |||||
vfs_unbusy(mp); | |||||
} | |||||
return (error); | |||||
} | |||||
/*
 * Lock 'vp', passing 'lkflags' | LK_RETRY through enforce_lkflags() (which
 * presumably adjusts the flags according to the vnode's mount; its definition
 * is not visible here).  Because of LK_RETRY, the acquisition cannot fail,
 * which is asserted; callers still need to check VN_IS_DOOMED() afterwards if
 * they care (as vn_cross_mounts() does).
 */
static void
vn_lock_enforced_flags(struct vnode *vp, int lkflags)
{
	int error __unused;

	error = vn_lock(vp, enforce_lkflags(vp->v_mount, lkflags | LK_RETRY));
	KASSERT(error == 0,
	    ("%s: vn_lock(LK_RETRY) returned %d", __func__, error));
}
/*
 * Repeatedly cross mounts starting from a given vnode.
 *
 * Traverses all successive mounts on the same path, locking the successive
 * vnodes as specified by enforce_lkflags() and unlocking them after obtaining
 * their covering mount.  Ensures the final vnode is locked and actively
 * referenced.  The initial vnode is returned unlocked and its active reference
 * is released except if it is also the final vnode (no mount points to cross).
 *
 * Mounts are crossed until reaching a vnode that is not covered by a mount,
 * which is returned locked.  If some traversed vnode happens to be doomed,
 * ENOENT is returned.  Can return errors reported by VFS_ROOT().  On success,
 * puts the final vnode into '*vpp' and returns 0.
 *
 * This function ensures that the crossed mountpoint cannot be busied and the
 * initial vnode locked at the same time.  The goal is to avoid establishing
 * a lock order between them in order to avoid deadlocks, at lookup with mounted
 * stacked filesystems (nullfs, unionfs) where locking the mountpoint's root
 * vnode leads to locking the covered vnode as well and vice-versa, but also at
 * unmount where parallel vfs_busy() calls block while acquiring the covered
 * vnode's lock, which establishes the acquisition order mount point -> covered
 * vnode.  This function (through the VFS_ROOT() call) only establishes the
 * acquisition order mount point -> root vnode, which implies mount point ->
 * covered vnode for stacked filesystems, thus the same order as that of
 * dounmount().  In other words, the legal order is that a mount point reference
 * must always be acquired before the vnode's lock, be it the root vnode under
 * the mount point or the covered vnode over it.
 */
int
vn_cross_mounts(struct vnode* vp, int const lkflags, struct vnode ** const vpp)
{
	int error;
	bool unlocked;

	for (;;) {
		error = vn_cross_single_mount(vp, lkflags, &unlocked, vpp);
		/* Optimize for the non-mount-point case. */
		if (__predict_true(error == EJUSTRETURN)) {
			/* No more mounts to cross. */
			*vpp = vp;
			error = 0;
			if (__predict_false(unlocked)) {
				/*
				 * 'vp' was unlocked along the way; relock it
				 * and recheck since it may have been doomed
				 * while unlocked.
				 */
				vn_lock_enforced_flags(vp, lkflags);
				if (VN_IS_DOOMED(vp)) {
					vput(vp);
					*vpp = NULL;
					error = ENOENT;
				}
			}
			return (error);
		}
		if (__predict_false(error != 0)) {
			/* Release our reference; also the lock if held. */
			if (__predict_true(unlocked))
				vrele(vp);
			else
				vput(vp);
			return (error);
		}

		/* Crossed one mount.  Try to cross another one. */
		MPASS(unlocked);
		ASSERT_VOP_UNLOCKED(vp, __func__);
		vrele(vp);
		vp = *vpp;
		ASSERT_VOP_LOCKED(vp, __func__);
	}
	__assert_unreachable();
}
/*
 * Per-lookup record of a directory vnode, kept on a tail queue; used by the
 * capability-mode dotdot checks.
 */
struct nameicap_tracker {
	struct vnode *dp;			/* tracked vnode */
	TAILQ_ENTRY(nameicap_tracker) nm_link;	/* tracker list linkage */
};

/* Zone for cap mode tracker elements used for dotdot capability checks. */
MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot");
▲ Show 20 Lines • Show All 1,458 Lines • Show Last 20 Lines |
Why do you need to check VN_IS_DOOMED() here? A non-doomed vnode should be either a precondition of the lookup's initial call to this function, or of the prior iteration's call to VFS_ROOT() with the mount busied.