sys/kern/vfs_cache.c
...
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#include <ck_queue.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/capsicum.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/uma.h>

SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
...
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
    "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
    "char *");
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
SDT_PROBE_DECLARE(vfs, namei, lookup, return);

/*
 * This structure describes the elements in the cache of recent
 * names looked up by namei.
 */
struct namecache {
	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
...

	if (!have_addr) {
		return;
	}
	vp = (struct vnode *)addr;
	db_print_vpath(vp);
}
#endif
extern uma_zone_t namei_zone;

static bool __read_frequently cache_fast_lookup = true;
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
    &cache_fast_lookup, 0, "");

#define	CACHE_FPL_FAILED	-2020

static void
cache_fpl_cleanup_cnp(struct componentname *cnp)
kib: So this is an exact copy of namei_cleanup_cnp(). Why?
mjg: See below.
{

	uma_zfree(namei_zone, cnp->cn_pnbuf);
#ifdef DIAGNOSTIC
	cnp->cn_pnbuf = NULL;
	cnp->cn_nameptr = NULL;
#endif
}
static void
cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
kib: And this is namei_handle_root(), but different for your use case by the lack of vrefact(). Can you reuse the code?
mjg: This does not do the strictrelative or beneath handling either. Like the path parsing below, this is a candidate for deduplication after more features show up; I don't see much point trying to dedup the current state. In particular it is unclear to me how deduplication of the ".." tracking for beneath/capability lookups is going to work out.
{
	struct componentname *cnp;

	cnp = &ndp->ni_cnd;

	while (*(cnp->cn_nameptr) == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}

	*dpp = ndp->ni_rootdir;
}
/*
 * Components of nameidata (or objects it can point to) which may
 * need restoring in case fast path lookup fails.
 */
struct nameidata_saved {
kib: Why create yet another structure, instead of saving all of nameidata?
mjg: The struct is enormous (232 bytes) and has to be checkpointed for every path component in case we have to fall back to the regular lookup.
	int cn_flags;
	long cn_namelen;
	char *cn_nameptr;
	size_t ni_pathlen;
};

struct cache_fpl {
	int line;
	enum cache_fpl_status status;
	bool in_smr;
	struct nameidata *ndp;
	struct nameidata_saved snd;
	struct componentname *cnp;
	struct vnode *dvp;
	seqc_t dvp_seqc;
	struct vnode *tvp;
	seqc_t tvp_seqc;
	struct pwd *pwd;
};
static void
cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd)
{

	snd->cn_flags = fpl->ndp->ni_cnd.cn_flags;
	snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen;
	snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
	snd->ni_pathlen = fpl->ndp->ni_pathlen;
}

static void
cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd)
{

	fpl->ndp->ni_cnd.cn_flags = snd->cn_flags;
	fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen;
	fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr;
	fpl->ndp->ni_pathlen = snd->ni_pathlen;
}
#ifdef INVARIANTS
#define cache_fpl_smr_assert_entered(fpl) ({			\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	VFS_SMR_ASSERT_ENTERED();				\
})
#define cache_fpl_smr_assert_not_entered(fpl) ({		\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	VFS_SMR_ASSERT_NOT_ENTERED();				\
})
#else
#define cache_fpl_smr_assert_entered(fpl) do { } while (0)
#define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
#endif

#define cache_fpl_smr_enter(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == false);				\
	vfs_smr_enter();					\
	_fpl->in_smr = true;					\
})

#define cache_fpl_smr_exit(fpl) ({				\
	struct cache_fpl *_fpl = (fpl);				\
	MPASS(_fpl->in_smr == true);				\
	vfs_smr_exit();						\
	_fpl->in_smr = false;					\
})
static int
cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
{

	if (fpl->status != CACHE_FPL_STATUS_UNSET) {
		KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL,
		    ("%s: converting to abort from %d at %d, set at %d\n",
		    __func__, fpl->status, line, fpl->line));
	}
	fpl->status = CACHE_FPL_STATUS_ABORTED;
	fpl->line = line;
	return (CACHE_FPL_FAILED);
}

#define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)

static int
cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
{

	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
	    ("%s: setting to partial at %d, but already set to %d at %d\n",
	    __func__, line, fpl->status, fpl->line));
	cache_fpl_smr_assert_entered(fpl);
	fpl->status = CACHE_FPL_STATUS_PARTIAL;
	fpl->line = line;
	return (CACHE_FPL_FAILED);
}

#define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)

static int
cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
{

	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
	    ("%s: setting to handled at %d, but already set to %d at %d\n",
	    __func__, line, fpl->status, fpl->line));
	cache_fpl_smr_assert_not_entered(fpl);
	MPASS(error != CACHE_FPL_FAILED);
	fpl->status = CACHE_FPL_STATUS_HANDLED;
	fpl->line = line;
	return (error);
}

#define cache_fpl_handled(x, e)	cache_fpl_handled_impl((x), (e), __LINE__)

#define CACHE_FPL_SUPPORTED_CN_FLAGS \
	(LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
static bool
cache_can_fplookup(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct thread *td;

	ndp = fpl->ndp;
	cnp = fpl->cnp;
	td = cnp->cn_thread;

	if (!cache_fast_lookup) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#ifdef MAC
	if (mac_vnode_check_lookup_enabled()) {
		cache_fpl_aborted(fpl);
		return (false);
	}
#endif
	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {

kib: != 0

		cache_fpl_aborted(fpl);
		return (false);
	}
	if ((cnp->cn_flags & LOCKLEAF) == 0) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (cnp->cn_nameiop != LOOKUP) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (ndp->ni_dirfd != AT_FDCWD) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (IN_CAPABILITY_MODE(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (AUDITING_TD(td)) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	if (ndp->ni_startdir != NULL) {
		cache_fpl_aborted(fpl);
		return (false);
	}
	return (true);
}
static bool
cache_fplookup_vnode_supported(struct vnode *vp)
{

	return (vp->v_type != VLNK);
}

kib: Can you just write return (vp->v_type == VLNK); ? At least for now.

/*
 * The target vnode is not supported, prepare for the slow path to take over.
 */
static int
cache_fplookup_partial_setup(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct vnode *dvp;
	struct pwd *pwd;
	seqc_t dvp_seqc;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;

	if (!vref_smr(dvp)) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_smr_exit(fpl);

	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		vrele(dvp);
		return (cache_fpl_aborted(fpl));
	}

	pwd = pwd_hold(curthread);
	if (fpl->pwd != pwd) {
		vrele(dvp);
		pwd_drop(pwd);
		return (cache_fpl_aborted(fpl));
	}

	fpl->ndp->ni_startdir = dvp;

kib: This looks too rude. Why not act on the partial result by fetching dp from the dedicated place?
mjg: Not sure I follow. The lookup loop in namei starts with dp = ndp->ni_startdir, making the assignment pretty natural in my opinion. Note partial returns go straight to the loop.

	return (0);
}
static int
cache_fplookup_final(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	enum vgetstate tvs;
	struct vnode *dvp, *tvp;
	seqc_t dvp_seqc, tvp_seqc;
	int error;

	cnp = fpl->cnp;
	dvp = fpl->dvp;
	dvp_seqc = fpl->dvp_seqc;
	tvp = fpl->tvp;
	tvp_seqc = fpl->tvp_seqc;

	VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
	MPASS((cnp->cn_flags & LOCKLEAF) != 0);

	tvs = vget_prep_smr(tvp);
	if (tvs == VGET_NONE) {
		return (cache_fpl_partial(fpl));
	}

	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
		cache_fpl_smr_exit(fpl);
		vget_abort(tvp, tvs);
		return (cache_fpl_aborted(fpl));
	}

	cache_fpl_smr_exit(fpl);

	error = vget_finish(tvp, cnp->cn_lkflags, tvs);
	if (error != 0) {
		return (cache_fpl_aborted(fpl));
	}

	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
		vput(tvp);
		return (cache_fpl_aborted(fpl));
	}

	return (cache_fpl_handled(fpl, 0));
}
static int
cache_fplookup_next(struct cache_fpl *fpl)
{
	struct componentname *cnp;
	struct namecache *ncp;
	struct vnode *dvp, *tvp;
	u_char nc_flag;
	uint32_t hash;

	cnp = fpl->cnp;
	dvp = fpl->dvp;

	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
		fpl->tvp = dvp;
		fpl->tvp_seqc = vn_seqc_read_any(dvp);
		if (seqc_in_modify(fpl->tvp_seqc)) {
			return (cache_fpl_aborted(fpl));
		}
		return (0);
	}

	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);

	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		counter_u64_add(numchecks, 1);
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}
	/*
	 * If there is no entry we have to punt to the slow path to perform
	 * the actual lookup. Should there be nothing with this name, a
	 * negative entry will be created.
	 */
	if (__predict_false(ncp == NULL)) {

kib: Why __predict_false? Side note: this code seems __predict-intensive overall.
mjg: We expect to have a cache hit of sorts for the vast majority of calls. Similarly, we don't expect to find an invalidated entry for the vast majority of lookups, and so on. In comparison there is no predict for whether the entry is positive or negative. In general all predicts are there for corner cases which are only handled for correctness and which we expect to encounter sparingly.

		return (cache_fpl_partial(fpl));
	}
	tvp = atomic_load_ptr(&ncp->nc_vp);
	nc_flag = atomic_load_char(&ncp->nc_flag);
	if (__predict_false(cache_ncp_invalid(ncp))) {

kib: This is weird. You load nc_flag (even with atomic) and then cache_ncp_invalid() reloads nc_flag and checks.
mjg: Unclear what the complaint is here. The code ensures that all loads from ncp are performed before the entry is invalidated, making sure we got a good snapshot. On CPUs which reorder loads this in particular could fetch nc_flag first and nc_vp later. The acq fence + nc_flag reload ensures it's all fine regardless of how we ended up reading from ncp.
kib: I do not see why not to apply the idiomatic way of using the acquire: you load tvp, then fence, then load nc_flag, and the check for invalid is performed on the value loaded by that read of nc_flag. This would require inlining cache_ncp_invalid(), but that is the only thing that could be said against it. Your code 'read nc_vp, read nc_flag, acq, re-read nc_flag' is strange (I am not saying wrong) because for the fence to have effect, the read must observe the rel-fenced write. There the value read before the fence is used, potentially (but I am not sure) providing inconsistent state. Instead of trying to convince myself and future readers that this is innocent, it is much simpler and obviously correct to just use one load.
mjg: Concerns about previous state aside, the code got reorganized because of the rebase. Since the NCF_HOTNEGATIVE flag got removed, the state has to be checked by:

	negstate = NCP2NEGSTATE(ncp);
	if ((negstate->neg_flag & NEG_HOT) == 0)
		...

which in the current patch is then followed by the fence + re-read. This leaves the positive entry case with your concern, but I don't think this can be addressed without dubious reordering of the above -- we don't know what the entry is without first reading nc_flag. On the other hand I want to maintain the invariant that the entry ceases to be accessed past the invalidation check.
kib: And I still do not see why not to reorg it by putting fence_acq() before the load of nc_flag, and then using a direct check for NCF_INVALID instead of cache_ncp_invalid(), in the current patch structure. And a related question: why is it fine to check NCF_NEGATIVE and do anything before checking for NCF_INVALID?
mjg: As noted above, we don't know if the entry is negative without testing nc_flag. But if it is negative, it has to be tested for the NEG_HOT flag stored in a struct union'ed with nc_vp. With your proposal the code would have to look like this:

	nc_flag = ...;
	fence_acq();
	if (nc_flag & NCF_NEGATIVE) {
		negstate = NCP2NEGSTATE(ncp);
		...

which reads negstate after we got the nc_flag snapshot to test. The invariant is that ncp only shows up in the hash once fully constructed, and the state is fine as long as the entry did not turn out to be invalidated. Then the code has guarantees equivalent to locking the relevant chain, reading everything and unlocking, which is de facto what the locked lookup is doing. There is a slight difference that we don't ref the vnode inside, but that's taken care of with seqc + smr.
kib: I do not see why you cannot read n_un before the fence, but ok.

		return (cache_fpl_partial(fpl));
	}
	if (__predict_false(nc_flag & NCF_WHITE)) {
		return (cache_fpl_partial(fpl));
	}

	fpl->tvp = tvp;

	if (nc_flag & NCF_NEGATIVE) {
		if ((nc_flag & NCF_HOTNEGATIVE) == 0) {

kib: What is NCF_HOTNEGATIVE?

			/*
			 * TODO
			 * Promoting to hot negative requires locks which are
			 * not yet supported for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
		SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
		    ncp->nc_name);
		counter_u64_add(numneghits, 1);
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENOENT));
	}

	fpl->tvp_seqc = vn_seqc_read_any(tvp);
	if (seqc_in_modify(fpl->tvp_seqc)) {
		return (cache_fpl_partial(fpl));
	}

	if (!cache_fplookup_vnode_supported(tvp)) {
		return (cache_fpl_partial(fpl));
	}

	counter_u64_add(numposhits, 1);
	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
	return (0);
}
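
The load/fence/reload pattern debated above can be illustrated in isolation. The following is an editorial sketch, not part of the patch: obj, o_data, o_flags and OBJ_INVALID are hypothetical stand-ins for a namecache entry, its nc_vp pointer, nc_flag and NCF_INVALID; only the atomic(9) primitives are real.

	/*
	 * Writer: make the invalidation visible only after any preceding
	 * modifications, by pairing a release fence with the store.
	 */
	atomic_thread_fence_rel();
	atomic_store_char(&obj->o_flags, obj->o_flags | OBJ_INVALID);

	/*
	 * Reader: speculatively load everything of interest, then issue an
	 * acquire fence and re-read the flag. If OBJ_INVALID is still clear,
	 * all loads above the fence are ordered before the invalidation
	 * became visible, i.e. the snapshot is consistent.
	 */
	data = atomic_load_ptr(&obj->o_data);
	flags = atomic_load_char(&obj->o_flags);
	atomic_thread_fence_acq();
	if (atomic_load_char(&obj->o_flags) & OBJ_INVALID)
		return (fallback());	/* snapshot cannot be trusted */
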
static bool
cache_fplookup_mp_supported(struct mount *mp)
{

	if (mp == NULL)
		return (false);
	if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
		return (false);
	if (mp->mnt_flag & MNT_UNION)

kib: != 0

		return (false);
	return (true);
}
/*
 * Walk up the mount stack (if any).
 *
 * Correctness is provided in the following ways:
 * - all vnodes are protected from freeing with SMR
 * - struct mount objects are type stable making them always safe to access
 * - stability of the particular mount is provided by busying it
 * - relationship between the vnode which is mounted on and the mount is
 *   verified with the vnode sequence counter after busying
 * - association between root vnode of the mount and the mount is protected
 *   by busy
 *
 * From that point on we can read the sequence counter of the root vnode
 * and get the next mount on the stack (if any) using the same protection.
 *
 * By the end of successful walk we are guaranteed the reached state was
 * indeed present at least at some point which matches the regular lookup.
 */
static int
cache_fplookup_climb_mount(struct cache_fpl *fpl)
{
	struct mount *mp, *prev_mp;
	struct vnode *vp;
	seqc_t vp_seqc;

	vp = fpl->tvp;
	vp_seqc = fpl->tvp_seqc;
	if (vp->v_type != VDIR)
		return (0);

	mp = atomic_load_ptr(&vp->v_mountedhere);
	if (mp == NULL)
		return (0);

	prev_mp = NULL;
	for (;;) {
		if (!vfs_op_thread_enter(mp)) {
			if (prev_mp != NULL)
				vfs_op_thread_exit(prev_mp);
			return (cache_fpl_partial(fpl));
		}
		if (prev_mp != NULL)
			vfs_op_thread_exit(prev_mp);
		if (!vn_seqc_consistent(vp, vp_seqc)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		if (!cache_fplookup_mp_supported(mp)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp = atomic_load_ptr(&mp->mnt_rootvnode);
		if (vp == NULL || VN_IS_DOOMED(vp)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		vp_seqc = vn_seqc_read_any(vp);
		if (seqc_in_modify(vp_seqc)) {
			vfs_op_thread_exit(mp);
			return (cache_fpl_partial(fpl));
		}
		prev_mp = mp;
		mp = atomic_load_ptr(&vp->v_mountedhere);
		if (mp == NULL)
			break;
	}

	vfs_op_thread_exit(prev_mp);
	fpl->tvp = vp;
	fpl->tvp_seqc = vp_seqc;
	return (0);
}
/*
 * Parse the path.
 *
 * The code is mostly copy-pasted from the regular lookup, see lookup().
 * The structure is maintained along with comments for easier maintenance.
 * Deduplicating the code will become feasible after fast path lookup
 * becomes more feature-complete.
 */
static int
cache_fplookup_parse(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	char *cp;
	char *prev_ni_next;		/* saved ndp->ni_next */
	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	/*
	 * Search a new directory.
	 *
	 * The last component of the filename is left accessible via
	 * cnp->cn_nameptr for callers that need the name. Callers needing
	 * the name set the SAVENAME flag. When done, they assume
	 * responsibility for freeing the pathname buffer.
	 */
	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
		continue;
	cnp->cn_namelen = cp - cnp->cn_nameptr;
	if (cnp->cn_namelen > NAME_MAX) {
		cache_fpl_smr_exit(fpl);
		return (cache_fpl_handled(fpl, ENAMETOOLONG));
	}
	prev_ni_pathlen = ndp->ni_pathlen;
	ndp->ni_pathlen -= cnp->cn_namelen;
	KASSERT(ndp->ni_pathlen <= PATH_MAX,
	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
	prev_ni_next = ndp->ni_next;
	ndp->ni_next = cp;

	/*
	 * Replace multiple slashes by a single slash and trailing slashes
	 * by a null. This must be done before VOP_LOOKUP() because some
	 * fs's don't know about trailing slashes. Remember if there were
	 * trailing slashes to handle symlinks, existing non-directories
	 * and non-existing files that won't be directories specially later.
	 */
	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
		cp++;
		ndp->ni_pathlen--;
		if (*cp == '\0') {
			/*
			 * TODO
			 * Regular lookup performs the following:
			 * *ndp->ni_next = '\0';
			 * cnp->cn_flags |= TRAILINGSLASH;
			 *
			 * Which is problematic since it modifies data read
			 * from userspace. Then if fast path lookup was to
			 * abort we would have to either restore it or convey
			 * the flag. Since this is a corner case just ignore
			 * it for simplicity.
			 */
			return (cache_fpl_partial(fpl));
		}
	}
	ndp->ni_next = cp;

	cnp->cn_flags |= MAKEENTRY;

	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
	else
		cnp->cn_flags &= ~ISDOTDOT;
	if (*ndp->ni_next == 0)
		cnp->cn_flags |= ISLASTCN;
	else
		cnp->cn_flags &= ~ISLASTCN;

	/*
	 * Check for degenerate name (e.g. / or "")
	 * which is a way of talking about a directory,
	 * e.g. like "/." or ".".
	 *
	 * TODO
	 * Another corner case handled by the regular lookup
	 */
	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
		return (cache_fpl_partial(fpl));
	}

	return (0);
}
static void
cache_fplookup_parse_advance(struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;

	ndp = fpl->ndp;
	cnp = fpl->cnp;

	cnp->cn_nameptr = ndp->ni_next;
	while (*cnp->cn_nameptr == '/') {
		cnp->cn_nameptr++;
		ndp->ni_pathlen--;
	}
}
static int
cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
{
	struct nameidata *ndp;
	struct componentname *cnp;
	struct mount *mp;
	int error;

	error = CACHE_FPL_FAILED;
	ndp = fpl->ndp;
	ndp->ni_lcf = 0;
	cnp = fpl->cnp;
	cnp->cn_lkflags = LK_SHARED;
	if ((cnp->cn_flags & LOCKSHARED) == 0)
		cnp->cn_lkflags = LK_EXCLUSIVE;

	cache_fpl_checkpoint(fpl, &fpl->snd);

	fpl->dvp = dvp;
	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
	if (seqc_in_modify(fpl->dvp_seqc)) {
		cache_fpl_aborted(fpl);
		goto out;
	}
	mp = atomic_load_ptr(&fpl->dvp->v_mount);
	if (!cache_fplookup_mp_supported(mp)) {
		cache_fpl_aborted(fpl);
		goto out;
	}

	VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

	for (;;) {
		error = cache_fplookup_parse(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		if (cnp->cn_flags & ISDOTDOT) {
			error = cache_fpl_partial(fpl);
			break;
		}

		VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);

		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
		if (__predict_false(error != 0)) {
			switch (error) {
			case EAGAIN:
			case EOPNOTSUPP: /* can happen when racing against vgone */
				cache_fpl_partial(fpl);
				break;
			default:
				/*
				 * See the API contract for VOP_FPLOOKUP_VEXEC.
				 */
				if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
					error = cache_fpl_aborted(fpl);
				} else {
					cache_fpl_smr_exit(fpl);
					cache_fpl_handled(fpl, error);
				}
				break;
			}
			break;
		}

		error = cache_fplookup_next(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		error = cache_fplookup_climb_mount(fpl);
		if (__predict_false(error != 0)) {
			break;
		}

		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);

		if (cnp->cn_flags & ISLASTCN) {
			error = cache_fplookup_final(fpl);
			break;
		}

		if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
			error = cache_fpl_aborted(fpl);
			break;
		}

		fpl->dvp = fpl->tvp;
		fpl->dvp_seqc = fpl->tvp_seqc;

		cache_fplookup_parse_advance(fpl);
		cache_fpl_checkpoint(fpl, &fpl->snd);
	}
out:
	switch (fpl->status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		cache_fpl_smr_assert_entered(fpl);
		return (cache_fplookup_partial_setup(fpl));
	case CACHE_FPL_STATUS_ABORTED:
		if (fpl->in_smr)
			cache_fpl_smr_exit(fpl);
		return (CACHE_FPL_FAILED);
	case CACHE_FPL_STATUS_HANDLED:
		cache_fpl_smr_assert_not_entered(fpl);
		if (__predict_false(error != 0)) {
			ndp->ni_dvp = NULL;
			ndp->ni_vp = NULL;
			cache_fpl_cleanup_cnp(cnp);
			return (error);
		}
		ndp->ni_dvp = fpl->dvp;
		ndp->ni_vp = fpl->tvp;
		if (cnp->cn_flags & SAVENAME)
			cnp->cn_flags |= HASBUF;
		else
			cache_fpl_cleanup_cnp(cnp);
		return (error);
	}
}
/*
 * Fast path lookup protected with SMR and sequence counters.
 *
 * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
 *
 * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
 * outlined below.
 *
 * Traditional vnode lookup conceptually looks like this:
 *
 * vn_lock(current);
 * for (;;) {
 *	next = find();
 *	vn_lock(next);
 *	vn_unlock(current);
 *	current = next;
 *	if (last)
 *		break;
 * }
 *
 * Each jump to the next vnode is safe memory-wise and atomic with respect to
 * any modifications thanks to holding respective locks.
 *
 * The same guarantee can be provided with a combination of safe memory
 * reclamation and sequence counters instead. If all operations which affect
 * the relationship between the current vnode and the one we are looking for
 * also modify the counter, we can verify whether all the conditions held as
 * we made the jump. This includes things like permissions, mount points etc.
 * In order to provide the guarantee all aforementioned places are enclosed by
 * vn_seqc_write_begin()/end().
 *
 * Thus this translates to:
 *
 * vfs_smr_enter();
 * current_seqc = seqc_read_any(current);
kib: 'You can grep' sounds frivolous. "All places that may modify data affecting lockless lookup are enclosed in vn_seqc_write_begin()/end()" braces.
 * if (seqc_in_modify(current_seqc))	// someone is altering the vnode
 *	abort();
 * for (;;) {
 *	next = find();
 *	next_seqc = seqc_read_any(next);
 *	if (!seqc_consistent(current, current_seqc))	// someone is altering the vnode
 *		abort();
 *	current = next;			// we know nothing of importance has changed
 *	current_seqc = next_seqc;	// store the counter for the next iteration
 *	if (last)
 *		break;
 * }
 *
 * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
 * - they are called while within vfs_smr protection which they must never exit
 * - EAGAIN can be returned to denote checking could not be performed, it is
 *   always valid to return it
 * - if the sequence counter has not changed the result must be valid
 * - if the sequence counter has changed both false positives and false negatives
 *   are permitted (since the result will be rejected later)
 * - for simple cases of unix permission checks vaccess_vexec_smr can be used
 *
 * Caveats to watch out for:
 * - vnodes are passed unlocked and unreferenced with nothing stopping
 *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
 *   to use atomic_load_ptr to fetch it.
 * - the aforementioned object can also get freed, meaning absent other means it
 *   should be protected with vfs_smr
 * - either safely checking permissions as they are modified or guaranteeing
 *   their stability is left to the routine
 */
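
To make the contract concrete, here is an editorial sketch of a conforming routine, not taken from the patch: the filesystem, the xxxfs_node layout and its 16-bit xn_mode field are hypothetical, and the exact signature of vaccess_vexec_smr() is an assumption based on the reference above.

	static int
	xxxfs_fplookup_vexec(struct vop_fplookup_vexec_args *v)
	{
		struct vnode *vp;
		struct xxxfs_node *node;
		mode_t mode;

		vp = v->a_vp;
		/* v_data can go NULL at any point; fetch it once, atomically. */
		node = atomic_load_ptr(&vp->v_data);
		if (__predict_false(node == NULL))
			return (EAGAIN);	/* punt to the locked lookup */
		/*
		 * The node is assumed to be freed through vfs_smr, keeping
		 * this read valid. A racing chmod/chown may be observed
		 * half-way, but the seqc check performed by the caller
		 * rejects such a result.
		 */
		mode = atomic_load_short(&node->xn_mode);
		return (vaccess_vexec_smr(mode, node->xn_uid, node->xn_gid,
		    v->a_cred));
	}
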
int
cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
    struct pwd **pwdp)
{
	struct cache_fpl fpl;
	struct pwd *pwd;
	struct vnode *dvp;
	struct componentname *cnp;
	struct nameidata_saved orig;
	int error;

	*status = CACHE_FPL_STATUS_UNSET;
	bzero(&fpl, sizeof(fpl));
	fpl.status = CACHE_FPL_STATUS_UNSET;
	fpl.ndp = ndp;
	fpl.cnp = &ndp->ni_cnd;
	MPASS(curthread == fpl.cnp->cn_thread);

	if (!cache_can_fplookup(&fpl)) {
		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
		*status = fpl.status;
		return (EOPNOTSUPP);
	}

	cache_fpl_checkpoint(&fpl, &orig);

	cache_fpl_smr_enter(&fpl);
	pwd = pwd_get_smr();
	fpl.pwd = pwd;
	ndp->ni_rootdir = pwd->pwd_rdir;
	ndp->ni_topdir = pwd->pwd_jdir;

	cnp = fpl.cnp;
	cnp->cn_nameptr = cnp->cn_pnbuf;
	if (cnp->cn_pnbuf[0] == '/') {
		cache_fpl_handle_root(ndp, &dvp);
	} else {
		MPASS(ndp->ni_dirfd == AT_FDCWD);
		dvp = pwd->pwd_cdir;
	}

	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);

	error = cache_fplookup_impl(dvp, &fpl);
	cache_fpl_smr_assert_not_entered(&fpl);
	SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);

	*status = fpl.status;
	switch (fpl.status) {
	case CACHE_FPL_STATUS_UNSET:
		__assert_unreachable();
		break;
	case CACHE_FPL_STATUS_HANDLED:
		SDT_PROBE3(vfs, namei, lookup, return, error,
		    (error == 0 ? ndp->ni_vp : NULL), true);
		break;
	case CACHE_FPL_STATUS_PARTIAL:
		*pwdp = fpl.pwd;
		cache_fpl_restore(&fpl, &fpl.snd);
		break;
	case CACHE_FPL_STATUS_ABORTED:
		cache_fpl_restore(&fpl, &orig);
		break;
	}
	return (error);
}
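
For orientation, the dispatch at the call site is expected to look roughly as follows. This is an editorial reconstruction from the status contract above, not the actual namei() integration:

	error = cache_fplookup(ndp, &status, &pwd);
	switch (status) {
	case CACHE_FPL_STATUS_HANDLED:
		/* Fully serviced lockless; error is the final result. */
		return (error);
	case CACHE_FPL_STATUS_PARTIAL:
		/*
		 * A prefix was resolved: ni_startdir holds a referenced
		 * directory and the regular lookup loop picks up from
		 * there, using the pwd returned via pwdp.
		 */
		break;
	case CACHE_FPL_STATUS_ABORTED:
	case CACHE_FPL_STATUS_UNSET:
		/* Fall back to the regular, locked lookup from scratch. */
		break;
	}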