diff --git a/sys/kern/vfs_acl.c b/sys/kern/vfs_acl.c index dffce9d29291..e5470bd11438 100644 --- a/sys/kern/vfs_acl.c +++ b/sys/kern/vfs_acl.c @@ -1,598 +1,601 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999-2006, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Developed by the TrustedBSD Project. * * ACL system calls and other functions common across different ACL types. * Type-specific routines go into subr_acl_.c. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES); MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists"); static int kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow); static int kern___acl_delete_path(struct thread *td, const char *path, acl_type_t type, int follow); static int kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow); static int kern___acl_set_path(struct thread *td, const char *path, acl_type_t type, const struct acl *aclp, int follow); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp); int acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest) { int i; if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; dest->acl_maxcnt = ACL_MAX_ENTRIES; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } int acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest) { int i; if (source->acl_cnt > OLDACL_MAX_ENTRIES) return (EINVAL); bzero(dest, sizeof(*dest)); dest->acl_cnt = source->acl_cnt; for (i = 0; i < dest->acl_cnt; i++) { dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag; dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id; dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm; } return (0); } /* * At one time, "struct ACL" was extended in order to add support for NFSv4 * ACLs. Instead of creating compatibility versions of all the ACL-related * syscalls, they were left intact. It's possible to find out what the code * calling these syscalls (libc) expects basing on "type" argument - if it's * either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were * known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct * oldacl". If it's something else, then it's the new "struct acl". In the * latter case, the routines below just copyin/copyout the contents. In the * former case, they copyin the "struct oldacl" and convert it to the new * format. */ static int acl_copyin(const void *user_acl, struct acl *kernel_acl, acl_type_t type) { int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = copyin(user_acl, &old, sizeof(old)); if (error != 0) break; acl_copy_oldacl_into_acl(&old, kernel_acl); break; default: error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl)); if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES) return (EINVAL); } return (error); } static int acl_copyout(const struct acl *kernel_acl, void *user_acl, acl_type_t type) { uint32_t am; int error; struct oldacl old; switch (type) { case ACL_TYPE_ACCESS_OLD: case ACL_TYPE_DEFAULT_OLD: error = acl_copy_acl_into_oldacl(kernel_acl, &old); if (error != 0) break; error = copyout(&old, user_acl, sizeof(old)); break; default: error = fueword32((char *)user_acl + offsetof(struct acl, acl_maxcnt), &am); if (error == -1) return (EFAULT); if (am != ACL_MAX_ENTRIES) return (EINVAL); error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl)); } return (error); } /* * Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new" * counterpart. It's required for old (pre-NFSv4 ACLs) libc to work * with new kernel. Fixing 'type' for old binaries with new libc * is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold(). */ static int acl_type_unold(int type) { switch (type) { case ACL_TYPE_ACCESS_OLD: return (ACL_TYPE_ACCESS); case ACL_TYPE_DEFAULT_OLD: return (ACL_TYPE_DEFAULT); default: return (type); } } /* * These calls wrap the real vnode operations, and are called by the syscall * code once the syscall has converted the path or file descriptor to a vnode * (unlocked). The aclp pointer is assumed still to point to userland, so * this should not be consumed within the kernel except by syscall code. * Other code should directly invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. */ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp) { struct acl *inkernelacl; struct mount *mp; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) goto out; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl); if (error != 0) goto out_unlock; #endif error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out_unlock: #endif VOP_UNLOCK(vp); vn_finished_write(mp); out: acl_free(inkernelacl); return (error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl *inkernelacl; int error; AUDIT_ARG_VALUE(type); inkernelacl = acl_alloc(M_WAITOK | M_ZERO); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_getacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp); if (error == 0) error = acl_copyout(inkernelacl, aclp, type); acl_free(inkernelacl); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { struct mount *mp; int error; AUDIT_ARG_VALUE(type); error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); #ifdef MAC error = mac_vnode_check_deleteacl(td->td_ucred, vp, type); if (error != 0) goto out; #endif error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td); #ifdef MAC out: #endif VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it * * XXXRW: No vnode lock held so can't audit vnode state...? */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, const struct acl *aclp) { struct acl *inkernelacl; int error; inkernelacl = acl_alloc(M_WAITOK); error = acl_copyin(aclp, inkernelacl, type); if (error != 0) goto out; error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl, td->td_ucred, td); out: acl_free(inkernelacl); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't * need to lock, as the vacl_ code will get/release any locks required. */ /* * Given a file path, get an ACL for it */ int sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { return (kern___acl_get_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, get an ACL for it; don't follow links. */ int sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap) { return(kern___acl_get_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_get_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, type, aclp); - NDFREE(&nd, 0); + vrele(nd.ni_vp); + NDFREE_PNBUF(&nd); } return (error); } /* * Given a file path, set an ACL for it. */ int sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, set an ACL for it; don't follow links. */ int sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap) { return(kern___acl_set_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_set_path(struct thread *td, const char *path, acl_type_t type, const struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, type, aclp); - NDFREE(&nd, 0); + vrele(nd.ni_vp); + NDFREE_PNBUF(&nd); } return (error); } /* * Given a file descriptor, get an ACL for it. */ int sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_GET), &fp); if (error == 0) { error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file descriptor, set an ACL for it. */ int sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_SET), &fp); if (error == 0) { error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } /* * Given a file path, delete an ACL from it. */ int sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { return (kern___acl_delete_path(td, uap->path, uap->type, FOLLOW)); } /* * Given a file path, delete an ACL from it; don't follow links. */ int sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap) { return (kern___acl_delete_path(td, uap->path, uap->type, NOFOLLOW)); } static int kern___acl_delete_path(struct thread *td, const char *path, acl_type_t type, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, type); - NDFREE(&nd, 0); + vrele(nd.ni_vp); + NDFREE_PNBUF(&nd); } return (error); } /* * Given a file path, delete an ACL from it. */ int sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_DELETE), &fp); if (error == 0) { error = vacl_delete(td, fp->f_vnode, uap->type); fdrop(fp, td); } return (error); } /* * Given a file path, check an ACL for it. */ int sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, FOLLOW)); } /* * Given a file path, check an ACL for it; don't follow links. */ int sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap) { return (kern___acl_aclcheck_path(td, uap->path, uap->type, uap->aclp, NOFOLLOW)); } static int kern___acl_aclcheck_path(struct thread *td, const char *path, acl_type_t type, struct acl *aclp, int follow) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, follow, UIO_USERSPACE, path); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, type, aclp); - NDFREE(&nd, 0); + NDFREE_PNBUF(&nd); } return (error); } /* * Given a file descriptor, check an ACL for it. */ int sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->filedes); error = getvnode(td, uap->filedes, cap_rights_init_one(&rights, CAP_ACL_CHECK), &fp); if (error == 0) { error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp); fdrop(fp, td); } return (error); } struct acl * acl_alloc(int flags) { struct acl *aclp; aclp = malloc(sizeof(*aclp), M_ACL, flags); if (aclp == NULL) return (NULL); aclp->acl_maxcnt = ACL_MAX_ENTRIES; return (aclp); } void acl_free(struct acl *aclp) { free(aclp, M_ACL); } diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index e58d3f338085..08d7ab2895cd 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1,6126 +1,6128 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #include #ifdef DDB #include #endif #include /* * High level overview of name caching in the VFS layer. * * Originally caching was implemented as part of UFS, later extracted to allow * use by other filesystems. A decision was made to make it optional and * completely detached from the rest of the kernel, which comes with limitations * outlined near the end of this comment block. * * This fundamental choice needs to be revisited. In the meantime, the current * state is described below. Significance of all notable routines is explained * in comments placed above their implementation. Scattered thoroughout the * file are TODO comments indicating shortcomings which can be fixed without * reworking everything (most of the fixes will likely be reusable). Various * details are omitted from this explanation to not clutter the overview, they * have to be checked by reading the code and associated commentary. * * Keep in mind that it's individual path components which are cached, not full * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, * one for each name. * * I. Data organization * * Entries are described by "struct namecache" objects and stored in a hash * table. See cache_get_hash for more information. * * "struct vnode" contains pointers to source entries (names which can be found * when traversing through said vnode), destination entries (names of that * vnode (see "Limitations" for a breakdown on the subject) and a pointer to * the parent vnode. * * The (directory vnode; name) tuple reliably determines the target entry if * it exists. * * Since there are no small locks at this time (all are 32 bytes in size on * LP64), the code works around the problem by introducing lock arrays to * protect hash buckets and vnode lists. * * II. Filesystem integration * * Filesystems participating in name caching do the following: * - set vop_lookup routine to vfs_cache_lookup * - set vop_cachedlookup to whatever can perform the lookup if the above fails * - if they support lockless lookup (see below), vop_fplookup_vexec and * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the * mount point * - call cache_purge or cache_vop_* routines to eliminate stale entries as * applicable * - call cache_enter to add entries depending on the MAKEENTRY flag * * With the above in mind, there are 2 entry points when doing lookups: * - ... -> namei -> cache_fplookup -- this is the default * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei * should the above fail * * Example code flow how an entry is added: * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter * * III. Performance considerations * * For lockless case forward lookup avoids any writes to shared areas apart * from the terminal path component. In other words non-modifying lookups of * different files don't suffer any scalability problems in the namecache. * Looking up the same file is limited by VFS and goes beyond the scope of this * file. * * At least on amd64 the single-threaded bottleneck for long paths is hashing * (see cache_get_hash). There are cases where the code issues acquire fence * multiple times, they can be combined on architectures which suffer from it. * * For locked case each encountered vnode has to be referenced and locked in * order to be handed out to the caller (normally that's namei). This * introduces significant hit single-threaded and serialization multi-threaded. * * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- * avoids any writes to shared areas to any components. * * Unrelated insertions are partially serialized on updating the global entry * counter and possibly serialized on colliding bucket or vnode locks. * * IV. Observability * * Note not everything has an explicit dtrace probe nor it should have, thus * some of the one-liners below depend on implementation details. * * Examples: * * # Check what lookups failed to be handled in a lockless manner. Column 1 is * # line number, column 2 is status code (see cache_fpl_status) * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' * * # Lengths of names added by binary name * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' * * # Same as above but only those which exceed 64 characters * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' * * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what * # path is it * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' * * V. Limitations and implementation defects * * - since it is possible there is no entry for an open file, tools like * "procstat" may fail to resolve fd -> vnode -> path to anything * - even if a filesystem adds an entry, it may get purged (e.g., due to memory * shortage) in which case the above problem applies * - hardlinks are not tracked, thus if a vnode is reachable in more than one * way, resolving a name may return a different path than the one used to * open it (even if said path is still valid) * - by default entries are not added for newly created files * - adding an entry may need to evict negative entry first, which happens in 2 * distinct places (evicting on lookup, adding in a later VOP) making it * impossible to simply reuse it * - there is a simple scheme to evict negative entries as the cache is approaching * its capacity, but it is very unclear if doing so is a good idea to begin with * - vnodes are subject to being recycled even if target inode is left in memory, * which loses the name cache entries when it perhaps should not. in case of tmpfs * names get duplicated -- kept by filesystem itself and namecache separately * - struct namecache has a fixed size and comes in 2 variants, often wasting space. * now hard to replace with malloc due to dependence on SMR. * - lack of better integration with the kernel also turns nullfs into a layered * filesystem instead of something which can take advantage of caching */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache"); SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", "const char *"); SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", "struct namecache *", "int", "int"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); static char __read_frequently cache_fast_lookup_enabled = true; /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; u_char neg_hit; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. */ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ int nc_pad; struct namecache nc_nc; }; TAILQ_HEAD(cache_freebatch, namecache); /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) /* * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the * 4.4 BSD codebase. Later on struct namecache was tweaked to become * smaller and the value was bumped to retain the total size, but it * was never re-evaluated for suitability. A simple test counting * lengths during package building shows that the value of 45 covers * about 86% of all added entries, reaching 99% at 65. * * Regardless of the above, use of dedicated zones instead of malloc may be * inducing additional waste. This may be hard to address as said zones are * tied to VFS SMR. Even if retaining them, the current split should be * re-evaluated. */ #ifdef __LP64__ #define CACHE_PATH_CUTOFF 45 #define CACHE_LARGE_PAD 6 #else #define CACHE_PATH_CUTOFF 41 #define CACHE_LARGE_PAD 2 #endif #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 static bool cache_neg_evict_cond(u_long lnumcache); /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ #define cache_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ }) /* * Like the above but also checks NCF_WHITE. */ #define cache_fpl_neg_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ }) VFS_SMR_DECLARE; static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache parameters"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, "Total namecache capacity"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); /* * Negative entry % of namecache capacity above which automatic eviction is allowed. * * Check cache_neg_evict_cond for details. */ static u_int ncnegminpct = 3; static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, "Negative entry count above which automatic eviction is allowed"); /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ struct nchstats nchstats; /* cache effectiveness statistics */ static u_int __exclusive_cache_line neg_cycle; #define ncneghash 3 #define numneglists (ncneghash + 1) struct neglist { struct mtx nl_evict_lock; struct mtx nl_lock __aligned(CACHE_LINE_SIZE); TAILQ_HEAD(, namecache) nl_list; TAILQ_HEAD(, namecache) nl_hotlist; u_long nl_hotnum; } __aligned(CACHE_LINE_SIZE); static struct neglist neglists[numneglists]; static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct mtx_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_time; *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define STATNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); STATNODE_ULONG(count, numcache, "Number of cache entries"); STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(posszaps, numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(negzaps, numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. */ STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(fullpathfail2, numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); /* * Debug or developer statistics. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache debugging"); #define DEBUGNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define DEBUGNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, "Number of successful removals after relocking"); static long zap_bucket_fail; DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); static long zap_bucket_fail2; DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); static long cache_lock_vnodes_cel_3_failures; DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend); static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * Directory vnodes with entries are held for two reasons: * 1. make them less of a target for reclamation in vnlru * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided * * It will be feasible to stop doing it altogether if all filesystems start * supporting lockless lookup. */ static void cache_hold_vnode(struct vnode *vp) { cache_assert_vnode_locked(vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); vhold(vp); counter_u64_add(numcachehv, 1); } static void cache_drop_vnode(struct vnode *vp) { /* * Called after all locks are dropped, meaning we can't assert * on the state of v_cache_src. */ vdrop(vp); counter_u64_add(numcachehv, -1); } /* * UMA zones. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; char * cache_symlink_alloc(size_t size, int flags) { if (size < CACHE_ZONE_SMALL_SIZE) { return (uma_zalloc_smr(cache_zone_small, flags)); } if (size < CACHE_ZONE_LARGE_SIZE) { return (uma_zalloc_smr(cache_zone_large, flags)); } counter_u64_add(symlinktoobig, 1); SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); return (NULL); } void cache_symlink_free(char *string, size_t size) { MPASS(string != NULL); KASSERT(size < CACHE_ZONE_LARGE_SIZE, ("%s: size %zu too big", __func__, size)); if (size < CACHE_ZONE_SMALL_SIZE) { uma_zfree_smr(cache_zone_small, string); return; } if (size < CACHE_ZONE_LARGE_SIZE) { uma_zfree_smr(cache_zone_large, string); return; } __assert_unreachable(); } static struct namecache * cache_alloc_uma(int len, bool ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free_uma(struct namecache *ncp) { struct namecache_ts *ncp_ts; if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static struct namecache * cache_alloc(int len, bool ts) { u_long lnumcache; /* * Avoid blowout in namecache entries. * * Bugs: * 1. filesystems may end up trying to add an already existing entry * (for example this can happen after a cache miss during concurrent * lookup), in which case we will call cache_neg_evict despite not * adding anything. * 2. the routine may fail to free anything and no provisions are made * to make it try harder (see the inside for failure modes) * 3. it only ever looks at negative entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (cache_neg_evict_cond(lnumcache)) { lnumcache = atomic_load_long(&numcache); } if (__predict_false(lnumcache >= ncsize)) { atomic_subtract_long(&numcache, 1); counter_u64_add(numdrops, 1); return (NULL); } return (cache_alloc_uma(len, ts)); } static void cache_free(struct namecache *ncp) { MPASS(ncp != NULL); if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); atomic_subtract_long(&numcache, 1); } static void cache_free_batch(struct cache_freebatch *batch) { struct namecache *ncp, *nnp; int i; i = 0; if (TAILQ_EMPTY(batch)) goto out; TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); i++; } atomic_subtract_long(&numcache, i); out: SDT_PROBE1(vfs, namecache, purge, batch, i); } /* * Hashing. * * The code was made to use FNV in 2001 and this choice needs to be revisited. * * Short summary of the difficulty: * The longest name which can be inserted is NAME_MAX characters in length (or * 255 at the time of writing this comment), while majority of names used in * practice are significantly shorter (mostly below 10). More importantly * majority of lookups performed find names are even shorter than that. * * This poses a problem where hashes which do better than FNV past word size * (or so) tend to come with additional overhead when finalizing the result, * making them noticeably slower for the most commonly used range. * * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c * * When looking it up the most time consuming part by a large margin (at least * on amd64) is hashing. Replacing FNV with something which pessimizes short * input would make the slowest part stand out even more. */ /* * TODO: With the value stored we can do better than computing the hash based * on the address. */ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static uint32_t cache_get_hash_iter_start(struct vnode *dvp) { return (dvp->v_nchash); } static uint32_t cache_get_hash_iter(char c, uint32_t hash) { return (fnv_32_buf(&c, 1, hash)); } static uint32_t cache_get_hash_iter_finish(uint32_t hash) { return (hash); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct mtx * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_OWNED); } static void cache_assert_bucket_unlocked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_NOTOWNED); } #else #define cache_assert_bucket_locked(x) do { } while (0) #define cache_assert_bucket_unlocked(x) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_lock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_unlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); static void cache_recalc_neg_min(u_int val) { neg_min = (ncsize * val) / 100; } static int sysctl_negminpct(SYSCTL_HANDLER_ARGS) { u_int val; int error; val = ncnegminpct; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == ncnegminpct) return (0); if (val < 0 || val > 99) return (EINVAL); ncnegminpct = val; cache_recalc_neg_min(val); return (0); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); #ifdef DEBUG_CACHE /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * Various workloads create plenty of negative entries and barely use them * afterwards. Moreover malicious users can keep performing bogus lookups * adding even more entries. For example "make tinderbox" as of writing this * comment ends up with 2.6M namecache entries in total, 1.2M of which are * negative. * * As such, a rather aggressive eviction method is needed. The currently * employed method is a placeholder. * * Entries are split over numneglists separate lists, each of which is further * split into hot and cold entries. Entries get promoted after getting a hit. * Eviction happens on addition of new entry. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache negative entry statistics"); SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, "Number of negative cache entries"); static COUNTER_U64_DEFINE_EARLY(neg_created); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, "Number of created negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evicted); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, "Number of evicted negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, &neg_evict_skipped_empty, "Number of times evicting failed due to lack of entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, &neg_evict_skipped_missed, "Number of times evicting failed due to target entry disappearing"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, &neg_evict_skipped_contended, "Number of times evicting failed due to contention"); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, "Number of cache hits (negative)"); static int sysctl_neg_hot(SYSCTL_HANDLER_ARGS) { int i, out; out = 0; for (i = 0; i < numneglists; i++) out += neglists[i].nl_hotnum; return (SYSCTL_OUT(req, &out, sizeof(out))); } SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", "Number of hot negative entries"); static void cache_neg_init(struct namecache *ncp) { struct negstate *ns; ncp->nc_flag |= NCF_NEGATIVE; ns = NCP2NEGSTATE(ncp); ns->neg_flag = 0; ns->neg_hit = 0; counter_u64_add(neg_created, 1); } #define CACHE_NEG_PROMOTION_THRESH 2 static bool cache_neg_hit_prep(struct namecache *ncp) { struct negstate *ns; u_char n; ns = NCP2NEGSTATE(ncp); n = atomic_load_char(&ns->neg_hit); for (;;) { if (n >= CACHE_NEG_PROMOTION_THRESH) return (false); if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) break; } return (n + 1 == CACHE_NEG_PROMOTION_THRESH); } /* * Nothing to do here but it is provided for completeness as some * cache_neg_hit_prep callers may end up returning without even * trying to promote. */ #define cache_neg_hit_abort(ncp) do { } while (0) static void cache_neg_hit_finish(struct namecache *ncp) { SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); counter_u64_add(numneghits, 1); } /* * Move a negative entry to the hot list. */ static void cache_neg_promote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); if ((ns->neg_flag & NEG_HOT) == 0) { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum++; ns->neg_flag |= NEG_HOT; } } /* * Move a hot negative entry to the cold list. */ static void cache_neg_demote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); MPASS(ns->neg_flag & NEG_HOT); TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); nl->nl_hotnum--; ns->neg_flag &= ~NEG_HOT; atomic_store_char(&ns->neg_hit, 0); } /* * Move a negative entry to the hot list if it matches the lookup. * * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Exiting the section means * the found entry could have been evicted. We are going to look it * up again. */ static bool cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, struct namecache *oncp, uint32_t hash) { struct namecache *ncp; struct neglist *nl; u_char nc_flag; nl = NCP2NEGLIST(oncp); mtx_lock(&nl->nl_lock); /* * For hash iteration. */ vfs_smr_enter(); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * XXX There are no provisions to keep the vnode around, meaning we may * end up promoting a negative entry for a *new* vnode and returning * ENOENT on its account. This is the error we want to return anyway * and promotion is harmless. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (!cache_ncp_canuse(ncp)) { goto out_abort; } cache_neg_promote_locked(ncp); cache_neg_hit_finish(ncp); vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (true); out_abort: vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (false); } static void cache_neg_promote(struct namecache *ncp) { struct neglist *nl; nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); cache_neg_promote_locked(ncp); mtx_unlock(&nl->nl_lock); } static void cache_neg_insert(struct namecache *ncp) { struct neglist *nl; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); mtx_unlock(&nl->nl_lock); atomic_add_long(&numneg, 1); } static void cache_neg_remove(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); ns = NCP2NEGSTATE(ncp); mtx_lock(&nl->nl_lock); if ((ns->neg_flag & NEG_HOT) != 0) { TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum--; } else { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); } mtx_unlock(&nl->nl_lock); atomic_subtract_long(&numneg, 1); } static struct neglist * cache_neg_evict_select_list(void) { struct neglist *nl; u_int c; c = atomic_fetchadd_int(&neg_cycle, 1) + 1; nl = &neglists[c % numneglists]; if (!mtx_trylock(&nl->nl_evict_lock)) { counter_u64_add(neg_evict_skipped_contended, 1); return (NULL); } return (nl); } static struct namecache * cache_neg_evict_select_entry(struct neglist *nl) { struct namecache *ncp, *lncp; struct negstate *ns, *lns; int i; mtx_assert(&nl->nl_evict_lock, MA_OWNED); mtx_assert(&nl->nl_lock, MA_OWNED); ncp = TAILQ_FIRST(&nl->nl_list); if (ncp == NULL) return (NULL); lncp = ncp; lns = NCP2NEGSTATE(lncp); for (i = 1; i < 4; i++) { ncp = TAILQ_NEXT(ncp, nc_dst); if (ncp == NULL) break; ns = NCP2NEGSTATE(ncp); if (ns->neg_hit < lns->neg_hit) { lncp = ncp; lns = ns; } } return (lncp); } static bool cache_neg_evict(void) { struct namecache *ncp, *ncp2; struct neglist *nl; struct vnode *dvp; struct mtx *dvlp; struct mtx *blp; uint32_t hash; u_char nlen; bool evicted; nl = cache_neg_evict_select_list(); if (nl == NULL) { return (false); } mtx_lock(&nl->nl_lock); ncp = TAILQ_FIRST(&nl->nl_hotlist); if (ncp != NULL) { cache_neg_demote_locked(ncp); } ncp = cache_neg_evict_select_entry(nl); if (ncp == NULL) { counter_u64_add(neg_evict_skipped_empty, 1); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); return (false); } nlen = ncp->nc_nlen; dvp = ncp->nc_dvp; hash = cache_get_hash(ncp->nc_name, nlen, dvp); dvlp = VP2VNODELOCK(dvp); blp = HASH2BUCKETLOCK(hash); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); mtx_lock(dvlp); mtx_lock(blp); /* * Note that since all locks were dropped above, the entry may be * gone or reallocated to be something else. */ CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { if (ncp2 == ncp && ncp2->nc_dvp == dvp && ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) break; } if (ncp2 == NULL) { counter_u64_add(neg_evict_skipped_missed, 1); ncp = NULL; evicted = false; } else { MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); MPASS(blp == NCP2BUCKETLOCK(ncp)); SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(neg_evicted, 1); evicted = true; } mtx_unlock(blp); mtx_unlock(dvlp); if (ncp != NULL) cache_free(ncp); return (evicted); } /* * Maybe evict a negative entry to create more room. * * The ncnegfactor parameter limits what fraction of the total count * can comprise of negative entries. However, if the cache is just * warming up this leads to excessive evictions. As such, ncnegminpct * (recomputed to neg_min) dictates whether the above should be * applied. * * Try evicting if the cache is close to full capacity regardless of * other considerations. */ static bool cache_neg_evict_cond(u_long lnumcache) { u_long lnumneg; if (ncsize - 1000 < lnumcache) goto out_evict; lnumneg = atomic_load_long(&numneg); if (lnumneg < neg_min) return (false); if (lnumneg * ncnegfactor < lnumcache) return (false); out_evict: return (cache_neg_evict()); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; struct vnode *dvp, *vp; dvp = ncp->nc_dvp; vp = ncp->nc_vp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(vp); cache_assert_vnode_locked(dvp); cache_assert_bucket_locked(ncp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); if (ncp == vp->v_cache_dd) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); cache_neg_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == dvp->v_cache_dd) { atomic_store_ptr(&dvp->v_cache_dd, NULL); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; } } } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct mtx *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct mtx *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. */ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct mtx *blp) { struct namecache *rncp; cache_assert_bucket_unlocked(ncp); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); mtx_lock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) break; } if (rncp != NULL) { cache_zap_locked(rncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); counter_u64_add(zap_bucket_relock_success, 1); return (0); } mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct mtx *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; mtx_unlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static __noinline int cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { atomic_store_ptr(&dvp->v_cache_dd, NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); return (1); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (ncp == NULL) { mtx_unlock(blp); goto out_no_entry; } error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail++; goto retry; } counter_u64_add(numposzaps, 1); SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); cache_free(ncp); return (1); out_no_entry: counter_u64_add(nummisszap, 1); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static int __noinline cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct mtx *dvlp; enum vgetstate vs; int error, ltype; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) != 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { cache_remove_cnp(dvp, cnp); return (0); } counter_u64_add(dotdothits, 1); retry: dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; if (*vpp == NULL) goto negative_success; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } MPASS(dvp != *vpp); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); vs = vget_prep(*vpp); mtx_unlock(dvlp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); cache_zap_negative_locked_vnode_kl(ncp, dvp); mtx_unlock(dvlp); cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(dvlp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * Either both tsp and ticks have to be provided or neither of them. * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the * lock is not recursively acquired. */ static int __noinline cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; enum vgetstate vs; int error; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); retry: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { mtx_unlock(blp); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); vs = vget_prep(*vpp); mtx_unlock(blp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: /* * We don't get here with regular lookup apart from corner cases. */ if (__predict_true(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail2++; goto retry; } cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(blp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; enum vgetstate vs; int error; bool whiteout, neg_promote; u_short nc_flag; MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); } MPASS((cnp->cn_flags & ISDOTDOT) == 0); if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { cache_remove_cnp(dvp, cnp); return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); vfs_smr_enter(); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { vfs_smr_exit(); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } nc_flag = atomic_load_char(&ncp->nc_flag); if (nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto out_fallback; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto out_fallback; } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto out_fallback; } return (-1); negative_success: if (cnp->cn_nameiop == CREATE) { if (cnp->cn_flags & ISLASTCN) { vfs_smr_exit(); goto out_fallback; } } cache_out_ts(ncp, tsp, ticksp); whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); neg_promote = cache_neg_hit_prep(ncp); if (!cache_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); vfs_smr_exit(); goto out_fallback; } if (neg_promote) { vfs_smr_exit(); if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) goto out_fallback; } else { cache_neg_hit_finish(ncp); vfs_smr_exit(); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); out_fallback: return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); } struct celockstate { struct mtx *vlp[3]; struct mtx *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, struct mtx *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { mtx_lock(blp1); cel->blp[0] = blp1; } mtx_lock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) mtx_unlock(cel->blp[0]); mtx_unlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } atomic_store_ptr(&dvp->v_cache_dd, NULL); cache_enter_unlock(&cel); if (ncp != NULL) cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts; struct nchashhead *ncpp; uint32_t hash; int flag; int len; KASSERT(cnp->cn_namelen <= NAME_MAX, ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, NAME_MAX)); VNPASS(!VN_IS_DOOMED(dvp), dvp); VNPASS(dvp->v_type != VNON, dvp); if (vp != NULL) { VNPASS(!VN_IS_DOOMED(vp), vp); VNPASS(vp->v_type != VNON, vp); } if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { KASSERT(dvp == vp, ("%s: different vnodes for dot entry (%p; %p)\n", __func__, dvp, vp)); } else { KASSERT(dvp != vp, ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, cnp->cn_nameptr, dvp)); } #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); if (ncp == NULL) return; cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_neg_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); CK_SLIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { MPASS(cache_ncp_canuse(n2)); if ((n2->nc_flag & NCF_NEGATIVE) != 0) KASSERT(vp == NULL, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, NULL, vp, cnp->cn_nameptr)); else KASSERT(n2->nc_vp == vp, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, n2->nc_vp, vp, cnp->cn_nameptr)); /* * Entries are supposed to be immutable unless in the * process of getting destroyed. Accommodating for * changing timestamps is possible but not worth it. * This should be harmless in terms of correctness, in * the worst case resulting in an earlier expiration. * Alternatively, the found entry can be replaced * altogether. */ MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); #if 0 if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } #endif SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, vp); goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); atomic_thread_fence_rel(); atomic_store_ptr(&dvp->v_cache_dd, ncp); } if (vp != NULL) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } atomic_thread_fence_rel(); atomic_store_ptr(&vp->v_cache_dd, ncp); } else if (vp->v_type != VDIR) { if (vp->v_cache_dd != NULL) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { cache_hold_vnode(dvp); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); cache_neg_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (ndd != NULL) cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } /* * A variant of the above accepting flags. * * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. * * TODO: this routine is a hack. It blindly removes the old entry, even if it * happens to match and it is doing it in an inefficient manner. It was added * to accommodate NFS which runs into a case where the target for a given name * may change from under it. Note this does nothing to solve the following * race: 2 callers of cache_enter_time_flags pass a different target vnode for * the same [dvp, cnp]. It may be argued that code doing this is broken. */ void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags) { MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); if (flags & VFS_CACHE_DROPOLD) cache_remove_cnp(dvp, cnp); cache_enter_time(dvp, vp, cnp, tsp, dtsp); } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; cache_recalc_neg_min(ncnegminpct); nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); for (i = 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); TAILQ_INIT(&neglists[i].nl_hotlist); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } /* * Induce transient cache misses for lockless operation in cache_lookup() by * using a temporary hash table. * * This will force a fs lookup. * * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time * to observe all CPUs not performing the lookup. */ static void cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) { MPASS(temphash < nchash); /* * Change the size. The new size is smaller and can safely be used * against the existing table. All lookups which now hash wrong will * result in a cache miss, which all callers are supposed to know how * to handle. */ atomic_store_long(&nchash, temphash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated hash value, but they still * see the old table. */ atomic_store_ptr(&nchashtbl, temptbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } /* * Set the new hash table. * * Similarly to cache_changesize_set_temp(), this has to synchronize against * lockless operation in cache_lookup(). */ static void cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) { MPASS(nchash < new_hash); /* * Change the pointer first. This wont result in out of bounds access * since the temporary table is guaranteed to be smaller. */ atomic_store_ptr(&nchashtbl, new_tbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated pointer value, but they * still see the old size. */ atomic_store_long(&nchash, new_hash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; u_long new_nchash, old_nchash, temphash; struct namecache *ncp; uint32_t hash; u_long newncsize; int i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } temptbl = nchinittbl(1, &temphash); /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; cache_changesize_set_temp(temptbl, temphash); for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); } } ncsize = newncsize; cache_recalc_neg_min(ncnegminpct); cache_changesize_set_new(new_nchashtbl, new_nchash); cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); ncfreetbl(temptbl); } /* * Remove all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp; struct mtx *vlp, *vlp2; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); cache_free_batch(&batch); } /* * Opportunistic check to see if there is anything to do. */ static bool cache_has_entries(struct vnode *vp) { if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && atomic_load_ptr(&vp->v_cache_dd) == NULL) return (false); return (true); } void cache_purge(struct vnode *vp) { SDT_PROBE1(vfs, namecache, purge, done, vp); if (!cache_has_entries(vp)) return; cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } /* * Serialize against a potential thread doing cache_purge. */ vlp = VP2VNODELOCK(vp); mtx_wait_unlocked(vlp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } return; } /* * Remove all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp, *nnp; struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } mtx_unlock(vlp); cache_free_batch(&batch); } /* * Entry points for modifying VOP operations. */ void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { ASSERT_VOP_IN_SEQC(fdvp); ASSERT_VOP_IN_SEQC(fvp); ASSERT_VOP_IN_SEQC(tdvp); if (tvp != NULL) ASSERT_VOP_IN_SEQC(tvp); cache_purge(fvp); if (tvp != NULL) { cache_purge(tvp); KASSERT(!cache_remove_cnp(tdvp, tcnp), ("%s: lingering negative entry", __func__)); } else { cache_remove_cnp(tdvp, tcnp); } /* * TODO * * Historically renaming was always purging all revelang entries, * but that's quite wasteful. In particular turns out that in many cases * the target file is immediately accessed after rename, inducing a cache * miss. * * Recode this to reduce relocking and reuse the existing entry (if any) * instead of just removing it above and allocating a new one here. */ cache_enter(tdvp, fvp, tcnp); } void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { ASSERT_VOP_IN_SEQC(dvp); ASSERT_VOP_IN_SEQC(vp); cache_purge(vp); } #ifdef INVARIANTS /* * Validate that if an entry exists it matches. */ void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); if (CK_SLIST_EMPTY(NCHHASH(hash))) return; blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { if (ncp->nc_vp != vp) panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); } } mtx_unlock(blp); } void cache_assert_no_entries(struct vnode *vp) { VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); VNPASS(vp->v_cache_dd == NULL, vp); } #endif /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp) { struct vnode *vp, *mvp; size_t visited, purged; visited = purged = 0; /* * Somewhat wasteful iteration over all vnodes. Would be better to * support filtering and avoid the interlock to begin with. */ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { visited++; if (!cache_has_entries(vp)) { VI_UNLOCK(vp); continue; } vholdl(vp); VI_UNLOCK(vp); cache_purge(vp); purged++; vdrop(vp); } SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } /* * Canonicalize a path by walking it forward and back. * * BUGS: * - Nothing guarantees the integrity of the entire chain. Consider the case * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of * "foo" into "quux" during the backwards walk. The result will be * "quux/bar/baz/qux", which could not have been obtained by an incremental * walk in userspace. Moreover, the path we return is inaccessible if the * calling thread lacks permission to traverse "quux". */ static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) return (error); error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); if (error == 0) { error = copyout(retbuf, buf, size); free(freebuf, M_TEMP); } - NDFREE(&nd, 0); + vrele(nd.ni_vp); + vrele(nd.ni_dvp); + NDFREE_PNBUF(&nd); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } static struct namecache * vn_dd_from_dst(struct vnode *vp) { struct namecache *ncp; cache_assert_vnode_locked(vp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) return (ncp); } return (NULL); } int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); ncp = (*vp)->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { KASSERT(ncp == vn_dd_from_dst(*vp), ("%s: mismatch for dd entry (%p != %p)", __func__, ncp, vn_dd_from_dst(*vp))); } else { ncp = vn_dd_from_dst(*vp); } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; bool slash_prefixed; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; slash_prefixed = true; if (addend == 0) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; slash_prefixed = false; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. */ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static void __inline cache_rev_failed_impl(int *reason, int line) { *reason = line; } #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *tvp; struct mount *mp; struct namecache *ncp; size_t orig_buflen; int reason; int error; #ifdef KDTRACE_HOOKS int i; #endif seqc_t vp_seqc, tvp_seqc; u_char nc_flag; VFS_SMR_ASSERT_ENTERED(); if (!atomic_load_char(&cache_fast_lookup_enabled)) { vfs_smr_exit(); return (-1); } orig_buflen = *buflen; if (addend == 0) { MPASS(*buflen >= 2); *buflen -= 1; buf[*buflen] = '\0'; } if (vp == rdir || vp == rootvnode) { if (addend == 0) { *buflen -= 1; buf[*buflen] = '/'; } goto out_ok; } #ifdef KDTRACE_HOOKS i = 0; #endif error = -1; ncp = NULL; /* for sdt probe down below */ vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } for (;;) { #ifdef KDTRACE_HOOKS i++; #endif if ((vp->v_vflag & VV_ROOT) != 0) { mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) { cache_rev_failed(&reason); goto out_abort; } tvp = atomic_load_ptr(&mp->mnt_vnodecovered); tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; continue; } ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) { cache_rev_failed(&reason); goto out_abort; } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { cache_rev_failed(&reason); goto out_abort; } if (ncp->nc_nlen >= *buflen) { cache_rev_failed(&reason); error = ENOMEM; goto out_abort; } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); *buflen -= 1; buf[*buflen] = '/'; tvp = ncp->nc_dvp; tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { cache_rev_failed(&reason); goto out_abort; } if (!cache_ncp_canuse(ncp)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; if (vp == rdir || vp == rootvnode) break; } out_ok: vfs_smr_exit(); *retbuf = buf + *buflen; *buflen = orig_buflen - *buflen + addend; SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); return (0); out_abort: *buflen = orig_buflen; SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); vfs_smr_exit(); return (error); } static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen, addend; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); addend = 0; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; addend = orig_buflen - *buflen; } return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * * Since the namecache does not track hardlinks, the caller is expected to * first look up the target vnode with WANTPARENT flag passed to namei to get * dvp and vp. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * following names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ int vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, const char *hrdl_name, size_t hrdl_name_length, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; size_t addend; int error; enum vtype type; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; buf = malloc(*buflen, M_TEMP, M_WAITOK); addend = 0; /* * Check for VBAD to work around the vp_crossmp bug in lookup(). * * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be * set to mount point's root vnode while ni_dvp will be vp_crossmp. * If the type is VDIR (like in this very case) we can skip looking * at ni_dvp in the first place. However, since vnodes get passed here * unlocked the target may transition to doomed state (type == VBAD) * before we get to evaluate the condition. If this happens, we will * populate part of the buffer and descend to vn_fullpath_dir with * vp == vp_crossmp. Prevent the problem by checking for VBAD. * * This should be atomic_load(&vp->v_type) but it is illegal to take * an address of a bit field, even if said field is sized to char. * Work around the problem by reading the value into a full-sized enum * and then re-reading it with atomic_load which will still prevent * the compiler from re-reading down the road. */ type = vp->v_type; type = atomic_load_int(&type); if (type == VBAD) { error = ENOENT; goto out_bad; } if (type != VDIR) { addend = hrdl_name_length + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); tmpbuf[addend - 1] = '\0'; vp = dvp; } vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); vref(vp); error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); pwd_drop(pwd); } if (error != 0) goto out_bad; *freebuf = buf; return (0); out_bad: free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif static int cache_fast_lookup = 1; #define CACHE_FPL_FAILED -2020 void cache_fast_lookup_enabled_recalc(void) { int lookup_flag; int mac_on; #ifdef MAC mac_on = mac_vnode_check_lookup_enabled(); mac_on |= mac_vnode_check_readlink_enabled(); #else mac_on = 0; #endif lookup_flag = atomic_load_int(&cache_fast_lookup); if (lookup_flag && !mac_on) { atomic_store_char(&cache_fast_lookup_enabled, true); } else { atomic_store_char(&cache_fast_lookup_enabled, false); } } static int syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) { int error, old; old = atomic_load_int(&cache_fast_lookup); error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) cache_fast_lookup_enabled_recalc(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, cache_fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_outer { size_t ni_pathlen; int cn_flags; }; struct nameidata_saved { #ifdef INVARIANTS char *cn_nameptr; size_t ni_pathlen; #endif }; #ifdef INVARIANTS struct cache_fpl_debug { size_t ni_pathlen; }; #endif struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; char *nulchar; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; uint32_t hash; struct nameidata_saved snd; struct nameidata_outer snd_outer; int line; enum cache_fpl_status status:8; bool in_smr; bool fsearch; struct pwd **pwd; #ifdef INVARIANTS struct cache_fpl_debug debug; #endif }; static bool cache_fplookup_mp_supported(struct mount *mp); static bool cache_fplookup_is_mp(struct cache_fpl *fpl); static int cache_fplookup_cross_mount(struct cache_fpl *fpl); static int cache_fplookup_partial_setup(struct cache_fpl *fpl); static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); static int cache_fplookup_trailingslash(struct cache_fpl *fpl); static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; } static struct vnode * cache_fpl_handle_root(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } return (ndp->ni_rootdir); } static void cache_fpl_checkpoint_outer(struct cache_fpl *fpl) { fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; } static void cache_fpl_checkpoint(struct cache_fpl *fpl) { #ifdef INVARIANTS fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; #endif } static void cache_fpl_restore_partial(struct cache_fpl *fpl) { fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; #ifdef INVARIANTS fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; #endif } static void cache_fpl_restore_abort(struct cache_fpl *fpl) { cache_fpl_restore_partial(fpl); /* * It is 0 on entry by API contract. */ fpl->ndp->ni_resflags = 0; fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) static void cache_fpl_assert_status(struct cache_fpl *fpl) { switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_DESTROYED: case CACHE_FPL_STATUS_ABORTED: case CACHE_FPL_STATUS_PARTIAL: case CACHE_FPL_STATUS_HANDLED: break; } } #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #define cache_fpl_assert_status(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) static int __noinline cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; if (fpl->in_smr) cache_fpl_smr_exit(fpl); cache_fpl_restore_abort(fpl); /* * Resolving symlinks overwrites data passed by the caller. * Let namei know. */ if (ndp->ni_loopcnt > 0) { fpl->status = CACHE_FPL_STATUS_DESTROYED; cache_fpl_cleanup_cnp(cnp); } return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int __noinline cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (cache_fplookup_partial_setup(fpl)); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (0); } #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) static int cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); MPASS(error != 0); MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; fpl->dvp = NULL; fpl->tvp = NULL; return (error); } #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) static bool cache_fpl_terminated(struct cache_fpl *fpl) { return (fpl->status != CACHE_FPL_STATUS_UNSET); } #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | SAVESTART | WILLBEDIR | \ ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ OPENWRITE | WANTIOCTLCAPS) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_istrailingslash(struct cache_fpl *fpl) { MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); return (*(fpl->nulchar - 1) == '/'); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = curthread; if (!atomic_load_char(&cache_fast_lookup_enabled)) { cache_fpl_aborted_early(fpl); return (false); } if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted_early(fpl); return (false); } if (IN_CAPABILITY_MODE(td)) { cache_fpl_aborted_early(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted_early(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted_early(fpl); return (false); } return (true); } static int __noinline cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) { struct nameidata *ndp; struct componentname *cnp; int error; bool fsearch; ndp = fpl->ndp; cnp = fpl->cnp; error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } fpl->fsearch = fsearch; if ((*vpp)->v_type != VDIR) { if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } } return (0); } static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct vnode *dvp; cnp = fpl->cnp; dvp = fpl->dvp; cache_fpl_smr_exit(fpl); if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) return (cache_fpl_handled_error(fpl, ENOENT)); else return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; pwd = *(fpl->pwd); dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; if (!pwd_hold_smr(pwd)) { return (cache_fpl_aborted(fpl)); } /* * Note that seqc is checked before the vnode is locked, so by * the time regular lookup gets to it it may have moved. * * Ultimately this does not affect correctness, any lookup errors * are userspace racing with itself. It is guaranteed that any * path which ultimately gets found could also have been found * by regular lookup going all the way in absence of concurrent * modifications. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore_partial(fpl); #ifdef INVARIANTS if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); } #endif ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; /* * Skip potential extra slashes parsing did not take care of. * cache_fplookup_skip_slashes explains the mechanism. */ if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl)); } /* * They want to possibly modify the state of the namecache. */ static int __noinline cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; struct mount *mp; seqc_t dvp_seqc; int error; bool docache; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS(*(cnp->cn_nameptr) != '/'); MPASS(cache_fpl_islastcn(ndp)); if ((cnp->cn_flags & LOCKPARENT) == 0) MPASS((cnp->cn_flags & WANTPARENT) != 0); MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME); MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) docache = false; /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { cache_fpl_smr_exit(fpl); /* * Original code keeps not checking for CREATE which * might be a bug. For now let the old lookup decide. */ if (cnp->cn_nameiop == CREATE) { return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled_error(fpl, EROFS)); } if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EEXIST)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. * * XXX At least UFS requires its lookup routine to be called for * the last path component, which leads to some level of complication * and inefficiency: * - the target routine always locks the target vnode, but our caller * may not need it locked * - some of the VOP machinery asserts that the parent is locked, which * once more may be not required * * TODO: add a flag for filesystems which don't need this. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_EXCLUSIVE); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; cnp->cn_flags |= ISLASTCN; if (docache) cnp->cn_flags |= MAKEENTRY; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; cnp->cn_lkflags = LK_EXCLUSIVE; error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { if ((cnp->cn_flags & SAVESTART) != 0) { ndp->ni_startdir = dvp; vrefact(ndp->ni_startdir); } MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } /* * There are very hairy corner cases concerning various flag combinations * and locking state. In particular here we only hold one lock instead of * two. * * Skip the complexity as it is of no significance for normal workloads. */ if (__predict_false(tvp == dvp)) { vput(dvp); vrele(tvp); return (cache_fpl_aborted(fpl)); } /* * If they want the symlink itself we are fine, but if they want to * follow it regular lookup has to be engaged. */ if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } /* * Since we expect this to be the terminal vnode it should almost never * be a mount point. */ if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & FAILIFEXISTS) != 0) { vput(dvp); vput(tvp); return (cache_fpl_handled_error(fpl, EEXIST)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } if ((cnp->cn_flags & SAVESTART) != 0) { ndp->ni_startdir = dvp; vrefact(ndp->ni_startdir); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; ndp = fpl->ndp; if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } return (cache_fplookup_final_modifying(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. */ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || fpl->status == CACHE_FPL_STATUS_DESTROYED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS(*(cnp->cn_nameptr) != '/'); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } /* * Comment from locked lookup: * Check for degenerate name (e.g. / or "") which is a way of talking about a * directory, e.g. like "/." or ".". */ static int __noinline cache_fplookup_degenerate(struct cache_fpl *fpl) { struct componentname *cnp; struct vnode *dvp; enum vgetstate dvs; int error, lkflags; #ifdef INVARIANTS char *cp; #endif fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; #ifdef INVARIANTS for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { KASSERT(*cp == '/', ("%s: encountered non-slash; string [%s]\n", __func__, cnp->cn_pnbuf)); } #endif if (__predict_false(cnp->cn_nameiop != LOOKUP)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EISDIR)); } MPASS((cnp->cn_flags & SAVESTART) == 0); if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { return (cache_fplookup_final_withparent(fpl)); } dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(dvp, lkflags, dvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_emptypath(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate tvs; struct vnode *tvp; int error, lkflags; fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; MPASS(*cnp->cn_pnbuf == '\0'); if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); tvs = vget_prep_smr(tvp); cache_fpl_smr_exit(fpl); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } ndp->ni_resflags |= NIRES_EMPTYPATH; return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_noentry(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); if (cnp->cn_nameiop == LOOKUP) MPASS((cnp->cn_flags & NOCACHE) == 0); MPASS(!cache_fpl_isdotdot(cnp)); /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } if (cnp->cn_nameptr[0] == '/') { return (cache_fplookup_skip_slashes(fpl)); } if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } if (cnp->cn_nameptr[0] == '\0') { if (fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } return (cache_fplookup_trailingslash(fpl)); } if (cnp->cn_nameiop != LOOKUP) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } MPASS((cnp->cn_flags & SAVESTART) == 0); /* * Only try to fill in the component if it is the last one, * otherwise not only there may be several to handle but the * walk may be complicated. */ if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_SHARED); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; /* * TODO: provide variants which don't require locking either vnode. */ cnp->cn_flags |= ISLASTCN | MAKEENTRY; cnp->cn_lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) { cnp->cn_lkflags = LK_EXCLUSIVE; } error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { int error; MPASS(!seqc_in_modify(fpl->dvp_seqc)); /* * Just re-assign the value. seqc will be checked later for the first * non-dot path component in line and/or before deciding to return the * vnode. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *dvp; struct prison *pr; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; MPASS(cache_fpl_isdotdot(cnp)); /* * XXX this is racy the same way regular lookup is */ for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dvp == pr->pr_root) break; if (dvp == ndp->ni_rootdir || dvp == ndp->ni_topdir || dvp == rootvnode || pr != NULL) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. */ return (cache_fpl_partial(fpl)); } ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { return (cache_fpl_aborted(fpl)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_aborted(fpl)); } counter_u64_add(dotdothits, 1); return (0); } static int __noinline cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) { u_char nc_flag __diagused; bool neg_promote; #ifdef INVARIANTS nc_flag = atomic_load_char(&ncp->nc_flag); MPASS((nc_flag & NCF_NEGATIVE) != 0); #endif /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } neg_promote = cache_neg_hit_prep(ncp); if (!cache_fpl_neg_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (neg_promote) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } cache_neg_hit_finish(ncp); cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } /* * Resolve a symlink. Called by filesystem-specific routines. * * Code flow is: * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve */ int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) { struct nameidata *ndp; struct componentname *cnp; size_t adjust; ndp = fpl->ndp; cnp = fpl->cnp; if (__predict_false(len == 0)) { return (ENOENT); } if (__predict_false(len > MAXPATHLEN - 2)) { if (cache_fpl_istrailingslash(fpl)) { return (EAGAIN); } } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { return (ENAMETOOLONG); } if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { return (ELOOP); } adjust = len; if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); } else { if (cache_fpl_istrailingslash(fpl)) { adjust = len + 1; cnp->cn_pnbuf[len] = '/'; cnp->cn_pnbuf[len + 1] = '\0'; } else { cnp->cn_pnbuf[len] = '\0'; } } bcopy(string, cnp->cn_pnbuf, len); ndp->ni_pathlen += adjust; cache_fpl_pathlen_add(fpl, adjust); cnp->cn_nameptr = cnp->cn_pnbuf; fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl->tvp = NULL; return (0); } static int __noinline cache_fplookup_symlink(struct cache_fpl *fpl) { struct mount *mp; struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp, *tvp; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; tvp = fpl->tvp; if (cache_fpl_islastcn(ndp)) { if ((cnp->cn_flags & FOLLOW) == 0) { return (cache_fplookup_final(fpl)); } } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } /* * Note this check races against setting the flag just like regular * lookup. */ if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EACCES)); } error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); if (__predict_false(error != 0)) { switch (error) { case EAGAIN: return (cache_fpl_partial(fpl)); case ENOENT: case ENAMETOOLONG: case ELOOP: cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, error)); default: return (cache_fpl_aborted(fpl)); } } if (*(cnp->cn_nameptr) == '/') { fpl->dvp = cache_fpl_handle_root(fpl); fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } /* * The main loop assumes that ->dvp points to a vnode belonging * to a filesystem which can do lockless lookup, but the absolute * symlink can be wandering off to one which does not. */ mp = atomic_load_ptr(&fpl->dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (!cache_fplookup_mp_supported(mp)) { cache_fpl_checkpoint(fpl); return (cache_fpl_partial(fpl)); } } return (0); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; int error; cnp = fpl->cnp; dvp = fpl->dvp; hash = fpl->hash; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) { return (cache_fplookup_dot(fpl)); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { return (cache_fplookup_dotdot(fpl)); } } MPASS(!cache_fpl_isdotdot(cnp)); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { return (cache_fplookup_noentry(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { return (cache_fplookup_neg(fpl, ncp, hash)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static bool cache_fplookup_mp_supported(struct mount *mp) { MPASS(mp != NULL); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). * * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct mount_pcpu *mpcpu, *prev_mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp, mpcpu)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } prev_mp = mp; prev_mpcpu = mpcpu; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static int __noinline cache_fplookup_cross_mount(struct cache_fpl *fpl) { struct mount *mp; struct mount_pcpu *mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } if (!vfs_op_thread_enter_crit(mp, mpcpu)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (__predict_false(vp == NULL)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); vfs_op_thread_exit_crit(mp, mpcpu); if (seqc_in_modify(vp_seqc)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp != NULL)) { /* * There are possibly more mount points on top. * Normally this does not happen so for simplicity just start * over. */ return (cache_fplookup_climb_mount(fpl)); } fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } /* * Check if a vnode is mounted on. */ static bool cache_fplookup_is_mp(struct cache_fpl *fpl) { struct vnode *vp; vp = fpl->tvp; return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); } /* * Parse the path. * * The code was originally copy-pasted from regular lookup and despite * clean ups leaves performance on the table. Any modifications here * must take into account that in case off fallback the resulting * nameidata state has to be compatible with the original. */ /* * Debug ni_pathlen tracking. */ #ifdef INVARIANTS static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen += n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen -= n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { cache_fpl_pathlen_add(fpl, 1); } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { cache_fpl_pathlen_sub(fpl, 1); } #else static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { } #endif static void cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp; char *cp; uint32_t hash; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * Find the end of this path component, it is either / or nul. * * Store / as a temporary sentinel so that we only have one character * to test for. Pathnames tend to be short so this should not be * resulting in cache misses. * * TODO: fix this to be word-sized. */ MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], fpl->nulchar, cnp->cn_pnbuf)); KASSERT(*fpl->nulchar == '\0', ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, cnp->cn_pnbuf)); hash = cache_get_hash_iter_start(dvp); *fpl->nulchar = '/'; for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { KASSERT(*cp != '\0', ("%s: encountered unexpected nul; string [%s]\n", __func__, cnp->cn_nameptr)); hash = cache_get_hash_iter(*cp, hash); continue; } *fpl->nulchar = '\0'; fpl->hash = cache_get_hash_iter_finish(hash); cnp->cn_namelen = cp - cnp->cn_nameptr; cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); #ifdef INVARIANTS /* * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since * we are going to fail this lookup with ENAMETOOLONG (see below). */ if (cnp->cn_namelen <= NAME_MAX) { if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { panic("%s: mismatched hash for [%s] len %ld", __func__, cnp->cn_nameptr, cnp->cn_namelen); } } #endif /* * Hack: we have to check if the found path component's length exceeds * NAME_MAX. However, the condition is very rarely true and check can * be elided in the common case -- if an entry was found in the cache, * then it could not have been too long to begin with. */ ndp->ni_next = cp; } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; KASSERT(*(cnp->cn_nameptr) == '/', ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } /* * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. * * Lockless lookup tries to elide checking for spurious slashes and should they * be present is guaranteed to fail to find an entry. In this case the caller * must check if the name starts with a slash and call this routine. It is * going to fast forward across the spurious slashes and set the state up for * retry. */ static int __noinline cache_fplookup_skip_slashes(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); /* * Go back to one slash so that cache_fplookup_parse_advance has * something to skip. */ cnp->cn_nameptr--; cache_fpl_pathlen_inc(fpl); /* * cache_fplookup_parse_advance starts from ndp->ni_next */ ndp->ni_next = cnp->cn_nameptr; /* * See cache_fplookup_dot. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; return (0); } /* * Handle trailing slashes (e.g., "foo/"). * * If a trailing slash is found the terminal vnode must be a directory. * Regular lookup shortens the path by nulifying the first trailing slash and * sets the TRAILINGSLASH flag to denote this took place. There are several * checks on it performed later. * * Similarly to spurious slashes, lockless lookup handles this in a speculative * manner relying on an invariant that a non-directory vnode will get a miss. * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. * * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" * and denotes this is the last path component, which avoids looping back. * * Only plain lookups are supported for now to restrict corner cases to handle. */ static int __noinline cache_fplookup_trailingslash(struct cache_fpl *fpl) { #ifdef INVARIANTS size_t ni_pathlen; #endif struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *tvp; char *cn_nameptr_orig, *cn_nameptr_slash; seqc_t tvp_seqc; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; MPASS(fpl->dvp == fpl->tvp); KASSERT(cache_fpl_istrailingslash(fpl), ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, cnp->cn_pnbuf)); KASSERT(cnp->cn_nameptr[0] == '\0', ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], cnp->cn_pnbuf)); KASSERT(cnp->cn_namelen == 0, ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, cnp->cn_pnbuf)); MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); if (cnp->cn_nameiop != LOOKUP) { return (cache_fpl_aborted(fpl)); } if (__predict_false(tvp->v_type != VDIR)) { if (!vn_seqc_consistent(tvp, tvp_seqc)) { return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Denote the last component. */ ndp->ni_next = &cnp->cn_nameptr[0]; MPASS(cache_fpl_islastcn(ndp)); /* * Unwind trailing slashes. */ cn_nameptr_orig = cnp->cn_nameptr; while (cnp->cn_nameptr >= cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] != '/') { break; } } /* * Unwind to the beginning of the path component. * * Note the path may or may not have started with a slash. */ cn_nameptr_slash = cnp->cn_nameptr; while (cnp->cn_nameptr > cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] == '/') { break; } } if (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; } cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); cache_fpl_checkpoint(fpl); #ifdef INVARIANTS ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; if (ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif /* * If this was a "./" lookup the parent directory is already correct. */ if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { return (0); } /* * Otherwise we need to look it up. */ tvp = fpl->tvp; ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); if (__predict_false(ncp == NULL)) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { return (cache_fpl_aborted(fpl)); } fpl->dvp = ncp->nc_dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { struct componentname *cnp; struct vnode *dvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; /* * Hack: delayed empty path checking. */ if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } /* * TODO: Due to ignoring trailing slashes lookup will perform a * permission check on the last dir when it should not be doing it. It * may fail, but said failure should be ignored. It is possible to fix * it up fully without resorting to regular lookup, but for now just * abort. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_aborted(fpl)); } /* * Hack: delayed degenerate path checking. */ if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } /* * Hack: they may be looking up foo/bar, where foo is not a directory. * In such a case we need to return ENOTDIR, but we may happen to get * here with a different error. */ if (dvp->v_type != VDIR) { error = ENOTDIR; } /* * Hack: handle O_SEARCH. * * Open Group Base Specifications Issue 7, 2018 edition states: * * If the access mode of the open file description associated with the * file descriptor is not O_SEARCH, the function shall check whether * directory searches are permitted using the current permissions of * the directory underlying the file descriptor. If the access mode is * O_SEARCH, the function shall not perform the check. * * * Regular lookup tests for the NOEXECCHECK flag for every path * component to decide whether to do the permission check. However, * since most lookups never have the flag (and when they do it is only * present for the first path component), lockless lookup only acts on * it if there is a permission problem. Here the flag is represented * with a boolean so that we don't have to clear it on the way out. * * For simplicity this always aborts. * TODO: check if this is the first lookup and ignore the permission * problem. Note the flag has to survive fallback (if it happens to be * performed). */ if (fpl->fsearch) { return (cache_fpl_aborted(fpl)); } switch (error) { case EAGAIN: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_partial(fpl); } break; default: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled_error(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl); /* * The vnode at hand is almost always stable, skip checking for it. * Worst case this postpones the check towards the end of the iteration * of the main loop. */ fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { return (cache_fpl_aborted(fpl)); } MPASS(fpl->tvp == NULL); for (;;) { cache_fplookup_parse(fpl); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } error = cache_fplookup_next(fpl); if (__predict_false(cache_fpl_terminated(fpl))) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (fpl->tvp->v_type == VLNK) { error = cache_fplookup_symlink(fpl); if (cache_fpl_terminated(fpl)) { break; } } else { if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); } cache_fpl_checkpoint(fpl); } return (error); } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. * - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; int error; fpl.status = CACHE_FPL_STATUS_UNSET; fpl.in_smr = false; fpl.ndp = ndp; fpl.cnp = cnp = &ndp->ni_cnd; MPASS(ndp->ni_lcf == 0); KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, ("%s: internal flags found in cn_flags %" PRIx64, __func__, cnp->cn_flags)); if ((cnp->cn_flags & SAVESTART) != 0) { MPASS(cnp->cn_nameiop != LOOKUP); } MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); if (__predict_false(!cache_can_fplookup(&fpl))) { *status = fpl.status; SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); return (EOPNOTSUPP); } cache_fpl_checkpoint_outer(&fpl); cache_fpl_smr_enter_initial(&fpl); #ifdef INVARIANTS fpl.debug.ni_pathlen = ndp->ni_pathlen; #endif fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl.fsearch = false; fpl.tvp = NULL; /* for degenerate path handling */ fpl.pwd = pwdp; pwd = pwd_get_smr(); *(fpl.pwd) = pwd; ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { dvp = cache_fpl_handle_root(&fpl); MPASS(ndp->ni_resflags == 0); ndp->ni_resflags = NIRES_ABS; } else { if (ndp->ni_dirfd == AT_FDCWD) { dvp = pwd->pwd_cdir; } else { error = cache_fplookup_dirfd(&fpl, &dvp); if (__predict_false(error != 0)) { goto out; } } } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; if (SDT_PROBES_ENABLED()) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (fpl.status == CACHE_FPL_STATUS_HANDLED) SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, ndp); } if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); MPASS(fpl.tvp == NULL); } ndp->ni_dvp = fpl.dvp; ndp->ni_vp = fpl.tvp; } return (error); } diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c index d389b33d0634..d190ec60205a 100644 --- a/sys/kern/vfs_extattr.c +++ b/sys/kern/vfs_extattr.c @@ -1,755 +1,757 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int kern_extattr_set_path(struct thread *td, const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes, int follow); static int kern_extattr_get_path(struct thread *td, const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes, int follow); static int kern_extattr_delete_path(struct thread *td, const char *path, int attrnamespace, const char *attrname, int follow); static int kern_extattr_list_path(struct thread *td, const char *path, int attrnamespace, void *data, size_t nbytes, int follow); /* * Syscall to push extended attribute configuration information into the VFS. * Accepts a path, which it converts to a mountpoint, as well as a command * (int cmd), and attribute name and misc data. * * Currently this is used only by UFS1 extended attributes. */ #ifndef _SYS_SYSPROTO_H_ struct extattrctl_args { const char *path; int cmd; const char *filename; int attrnamespace; const char *attrname; }; #endif int sys_extattrctl(struct thread *td, struct extattrctl_args *uap) { struct vnode *filename_vp; struct nameidata nd; struct mount *mp, *mp_writable; char attrname[EXTATTR_MAXNAMELEN + 1]; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_VALUE(uap->attrnamespace); /* * uap->attrname is not always defined. We check again later when we * invoke the VFS call so as to pass in NULL there if needed. */ if (uap->attrname != NULL) { error = copyinstr(uap->attrname, attrname, sizeof(attrname), NULL); if (error) return (error); } AUDIT_ARG_TEXT(attrname); mp = NULL; filename_vp = NULL; if (uap->filename != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2, UIO_USERSPACE, uap->filename); error = namei(&nd); if (error) return (error); filename_vp = nd.ni_vp; - NDFREE(&nd, NDF_NO_VP_RELE); + NDFREE_PNBUF(&nd); } /* uap->path is always defined. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path); error = namei(&nd); if (error) goto out; mp = nd.ni_vp->v_mount; error = vfs_busy(mp, 0); if (error) { - NDFREE(&nd, 0); + vput(nd.ni_vp); + NDFREE_PNBUF(&nd); mp = NULL; goto out; } VOP_UNLOCK(nd.ni_vp); error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | V_PCATCH); - NDFREE(&nd, NDF_NO_VP_UNLOCK); + vrele(nd.ni_vp); + NDFREE_PNBUF(&nd); if (error) goto out; if (filename_vp != NULL) { /* * uap->filename is not always defined. If it is, * grab a vnode lock, which VFS_EXTATTRCTL() will * later release. */ error = vn_lock(filename_vp, LK_EXCLUSIVE); if (error) { vn_finished_write(mp_writable); goto out; } } error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace, uap->attrname != NULL ? attrname : NULL); vn_finished_write(mp_writable); out: if (mp != NULL) vfs_unbusy(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp, * so vrele it if it is defined. */ if (filename_vp != NULL) vrele(filename_vp); return (error); } /*- * Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct mount *mp; struct uio auio; struct iovec aiov; ssize_t cnt; int error; if (nbytes > IOSIZE_MAX) return (EINVAL); error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; cnt = nbytes; #ifdef MAC error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; #ifdef MAC done: #endif VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_set_fd_args { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_set_fd(struct thread *td, struct extattr_set_fd_args *uap) { struct file *fp; char attrname[EXTATTR_MAXNAMELEN + 1]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, sizeof(attrname), NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode_path(td, uap->fd, cap_rights_init_one(&rights, CAP_EXTATTR_SET), &fp); if (error) return (error); error = extattr_set_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_set_file_args { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_set_file(struct thread *td, struct extattr_set_file_args *uap) { return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, uap->attrname, uap->data, uap->nbytes, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct extattr_set_link_args { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_set_link(struct thread *td, struct extattr_set_link_args *uap) { return (kern_extattr_set_path(td, uap->path, uap->attrnamespace, uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); } static int kern_extattr_set_path(struct thread *td, const char *path, int attrnamespace, const char *uattrname, void *data, size_t nbytes, int follow) { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN + 1]; int error; AUDIT_ARG_VALUE(attrnamespace); error = copyinstr(uattrname, attrname, sizeof(attrname), NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error) return (error); NDFREE_PNBUF(&nd); error = extattr_set_vp(nd.ni_vp, attrnamespace, attrname, data, nbytes, td); vrele(nd.ni_vp); return (error); } /*- * Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", userspace buffer * pointer "data", buffer length "nbytes", thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; struct iovec aiov; ssize_t cnt; size_t size, *sizep; int error; if (nbytes > IOSIZE_MAX) return (EINVAL); vn_lock(vp, LK_SHARED | LK_RETRY); /* * Slightly unusual semantics: if the user provides a NULL data * pointer, they don't want to receive the data, just the maximum * read length. */ auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; #ifdef MAC error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep, td->td_ucred, td); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; #ifdef MAC done: #endif VOP_UNLOCK(vp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_get_fd_args { int fd; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_get_fd(struct thread *td, struct extattr_get_fd_args *uap) { struct file *fp; char attrname[EXTATTR_MAXNAMELEN + 1]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, sizeof(attrname), NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode_path(td, uap->fd, cap_rights_init_one(&rights, CAP_EXTATTR_GET), &fp); if (error) return (error); error = extattr_get_vp(fp->f_vnode, uap->attrnamespace, attrname, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_get_file_args { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_get_file(struct thread *td, struct extattr_get_file_args *uap) { return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, uap->attrname, uap->data, uap->nbytes, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct extattr_get_link_args { const char *path; int attrnamespace; const char *attrname; void *data; size_t nbytes; }; #endif int sys_extattr_get_link(struct thread *td, struct extattr_get_link_args *uap) { return (kern_extattr_get_path(td, uap->path, uap->attrnamespace, uap->attrname, uap->data, uap->nbytes, NOFOLLOW)); } static int kern_extattr_get_path(struct thread *td, const char *path, int attrnamespace, const char *uattrname, void *data, size_t nbytes, int follow) { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN + 1]; int error; AUDIT_ARG_VALUE(attrnamespace); error = copyinstr(uattrname, attrname, sizeof(attrname), NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error) return (error); NDFREE_PNBUF(&nd); error = extattr_get_vp(nd.ni_vp, attrnamespace, attrname, data, nbytes, td); vrele(nd.ni_vp); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace, attrname); if (error) goto done; #endif error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_ucred, td); #ifdef MAC done: #endif VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_delete_fd_args { int fd; int attrnamespace; const char *attrname; }; #endif int sys_extattr_delete_fd(struct thread *td, struct extattr_delete_fd_args *uap) { struct file *fp; char attrname[EXTATTR_MAXNAMELEN + 1]; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = copyinstr(uap->attrname, attrname, sizeof(attrname), NULL); if (error) return (error); AUDIT_ARG_TEXT(attrname); error = getvnode_path(td, uap->fd, cap_rights_init_one(&rights, CAP_EXTATTR_DELETE), &fp); if (error) return (error); error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace, attrname, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_delete_file_args { const char *path; int attrnamespace; const char *attrname; }; #endif int sys_extattr_delete_file(struct thread *td, struct extattr_delete_file_args *uap) { return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, uap->attrname, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct extattr_delete_link_args { const char *path; int attrnamespace; const char *attrname; }; #endif int sys_extattr_delete_link(struct thread *td, struct extattr_delete_link_args *uap) { return (kern_extattr_delete_path(td, uap->path, uap->attrnamespace, uap->attrname, NOFOLLOW)); } static int kern_extattr_delete_path(struct thread *td, const char *path, int attrnamespace, const char *uattrname, int follow) { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN + 1]; int error; AUDIT_ARG_VALUE(attrnamespace); error = copyinstr(uattrname, attrname, sizeof(attrname), NULL); if (error) return(error); AUDIT_ARG_TEXT(attrname); NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error) return(error); NDFREE_PNBUF(&nd); error = extattr_delete_vp(nd.ni_vp, attrnamespace, attrname, td); vrele(nd.ni_vp); return(error); } /*- * Retrieve a list of extended attributes on a file or directory. * * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace", * userspace buffer pointer "data", buffer length "nbytes", * thread "td". * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data, size_t nbytes, struct thread *td) { struct uio auio, *auiop; size_t size, *sizep; struct iovec aiov; ssize_t cnt; int error; if (nbytes > IOSIZE_MAX) return (EINVAL); auiop = NULL; sizep = NULL; cnt = 0; if (data != NULL) { aiov.iov_base = data; aiov.iov_len = nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_resid = nbytes; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auiop = &auio; cnt = nbytes; } else sizep = &size; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace); if (error) { VOP_UNLOCK(vp); return (error); } #endif error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep, td->td_ucred, td); VOP_UNLOCK(vp); if (auiop != NULL) { cnt -= auio.uio_resid; td->td_retval[0] = cnt; } else td->td_retval[0] = size; return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_list_fd_args { int fd; int attrnamespace; void *data; size_t nbytes; }; #endif int sys_extattr_list_fd(struct thread *td, struct extattr_list_fd_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_VALUE(uap->attrnamespace); error = getvnode_path(td, uap->fd, cap_rights_init_one(&rights, CAP_EXTATTR_LIST), &fp); if (error) return (error); error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data, uap->nbytes, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct extattr_list_file_args { const char *path; int attrnamespace; void *data; size_t nbytes; } #endif int sys_extattr_list_file(struct thread *td, struct extattr_list_file_args *uap) { return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, uap->data, uap->nbytes, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct extattr_list_link_args { const char *path; int attrnamespace; void *data; size_t nbytes; }; #endif int sys_extattr_list_link(struct thread *td, struct extattr_list_link_args *uap) { return (kern_extattr_list_path(td, uap->path, uap->attrnamespace, uap->data, uap->nbytes, NOFOLLOW)); } static int kern_extattr_list_path(struct thread *td, const char *path, int attrnamespace, void *data, size_t nbytes, int follow) { struct nameidata nd; int error; AUDIT_ARG_VALUE(attrnamespace); NDINIT(&nd, LOOKUP, follow | AUDITVNODE1, UIO_USERSPACE, path); error = namei(&nd); if (error) return (error); NDFREE_PNBUF(&nd); error = extattr_list_vp(nd.ni_vp, attrnamespace, data, nbytes, td); vrele(nd.ni_vp); return (error); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 4aea0f263e46..63a30cbbdb3d 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,7083 +1,7084 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vdropl_recycle(struct vnode *vp); static void vdrop_recycle(struct vnode *vp); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: - NDFREE(&nd, 0); + vput(vp); + NDFREE_PNBUF(&nd); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ #define vnsz2log 8 #ifndef DEBUG_LOCKS _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && sizeof(struct vnode) < 1UL << (vnsz2log + 1), "vnsz2log needs to be updated"); #endif /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } #ifdef KASAN static int vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) { kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); return (0); } static void vnode_dtor(void *mem, int size, void *arg __unused) { size_t end1, end2, off1, off2; _Static_assert(offsetof(struct vnode, v_vnodelist) < offsetof(struct vnode, v_dbatchcpu), "KASAN marks require updating"); off1 = offsetof(struct vnode, v_vnodelist); off2 = offsetof(struct vnode, v_dbatchcpu); end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); /* * Access to the v_vnodelist and v_dbatchcpu fields are permitted even * after the vnode has been freed. Try to get some KASAN coverage by * marking everything except those two fields as invalid. Because * KASAN's tracking is not byte-granular, any preceding fields sharing * the same 8-byte aligned word must also be marked valid. */ /* Handle the area from the start until v_vnodelist... */ off1 = rounddown2(off1, KASAN_SHADOW_SCALE); kasan_mark(mem, off1, off1, KASAN_UMA_FREED); /* ... then the area between v_vnodelist and v_dbatchcpu ... */ off1 = roundup2(end1, KASAN_SHADOW_SCALE); off2 = rounddown2(off2, KASAN_SHADOW_SCALE); if (off2 > off1) kasan_mark((void *)((char *)mem + off1), off2 - off1, off2 - off1, KASAN_UMA_FREED); /* ... and finally the area from v_dbatchcpu to the end. */ off2 = roundup2(end2, KASAN_SHADOW_SCALE); kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, KASAN_UMA_FREED); } #endif /* KASAN */ /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); kasan_mark(mem, size, size, 0); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; uma_ctor ctor; uma_dtor dtor; int cpu, physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); #ifdef KASAN ctor = vnode_ctor; dtor = vnode_dtor; #else ctor = NULL; dtor = NULL; #endif vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * 1. VOP_LOOKUP() obtains B while A is held * 2. vfs_busy() obtains a shared lock on F while A and B are held * 3. vput() releases lock on B * 4. vput() releases lock on A * 5. VFS_ROOT() obtains lock on D while shared lock on F is held * 6. vfs_unbusy() releases shared lock on F * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. Note that for stacked * filesystems, D and B in the example above may be the same lock, * which introdues potential lock order reversal deadlock between * dounmount() and step 5 above. These filesystems may avoid the LOR * by setting VV_CROSSLOCK on the covered vnode so that lock B will * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller the mount it tried to busy is no longer valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("%s: non-empty upper mount list with pending unmount", __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. */ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop_recycle(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->handle == vp && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) { struct vnode *vp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; vp = mvp; for (;;) { if (count == 0) { break; } vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); /* * FIXME: ignores the return value, meaning it may be nothing * got recycled but it claims otherwise to the caller. * * Originally the value started being ignored in 2005 with * 114a1006a8204aa156e1f9ad6476cdff89cada7f . * * Respecting the value can run into significant stalls if most * vnodes belong to one file system and it has writes * suspended. In presence of many threads and millions of * vnodes they keep contending on the vnode_list_mtx lock only * to find vnodes they can't recycle. * * The solution would be to pre-check if the vnode is likely to * be recycle-able, but it needs to happen with the * vnode_list_mtx lock held. This runs into a problem where * VOP_GETWRITEMOUNT (currently needed to find out about if * writes are frozen) can take locks which LOR against it. * * Check nullfs for one example (null_getwritemount). */ vtryrecycle(vp); count--; mtx_lock(&vnode_list_mtx); vp = mvp; } return (ocount - count); } static int vnlru_free_locked(int count) { mtx_assert(&vnode_list_mtx, MA_OWNED); return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp); mtx_unlock(&vnode_list_mtx); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static __inline void vfs_freevnodes_inc(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; critical_exit(); } static __inline void vfs_freevnodes_dec(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop_recycle(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop_recycle(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); cache_assert_no_entries(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { /* * Use LK_NOWAIT to shut up witness about the lock. We may get * here while having another vnode locked when trying to * satisfy a lookup and needing to recycle. */ VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); /* * The caller expects the interlock to be still held. */ ASSERT_VI_LOCKED(vp, __func__); } static int insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); } else { KASSERT(!dtr, ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", __func__)); } /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } /* * Insert into list of vnodes for the new mount point, if available. * insmntque() reclaims the vnode on insertion failure, insmntque1() * leaves handling of the vnode to the caller. */ int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, true)); } int insmntque1(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, false)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); return (EBUSY); } } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. */ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. */ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } static void __always_inline vdropl_impl(struct vnode *vp, bool enqueue) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } if (!enqueue) { VI_UNLOCK(vp); return; } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } void vdropl(struct vnode *vp) { vdropl_impl(vp, true); } /* * vdrop a vnode when recycling * * This is a special case routine only to be used when recycling, differs from * regular vdrop by not requeieing the vnode on LRU. * * Consider a case where vtryrecycle continuously fails with all vnodes (due to * e.g., frozen writes on the filesystem), filling the batch and causing it to * be requeued. Then vnlru will end up revisiting the same vnodes. This is a * loop which can last for as long as writes are frozen. */ static void vdropl_recycle(struct vnode *vp) { vdropl_impl(vp, false); } static void vdrop_recycle(struct vnode *vp) { VI_LOCK(vp); vdropl_recycle(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { struct vm_object *obj; int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } error = VOP_INACTIVE(vp); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) { struct mount *mp; struct mount_upper_node *ump; mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_notify)) return; MNT_ILOCK(mp); mp->mnt_upper_pending++; KASSERT(mp->mnt_upper_pending > 0, ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump->mp, vp); break; } MNT_ILOCK(mp); } mp->mnt_upper_pending--; if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && mp->mnt_upper_pending == 0) { mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ if (vp->v_lockf != NULL) { (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; } /* * Delete from old mount point vnode list. */ if (vp->v_mount == NULL) { VI_LOCK(vp); } else { delmntque(vp); ASSERT_VI_LOCKED(vp, "vgonel 2"); } /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); if (irflag & VIRF_TEXT_REF) strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_NOMSYNC); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_RECURSE); MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_NULL_NOCACHE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. */ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; int lkflags, objflags; bool seen_defer; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); free(vi, M_VNODEPOLL); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque1(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); VOP_UNLOCK(syncvp); save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (KERNEL_PANICKED() || vp == NULL) return; locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. */ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } static void vop_fsync_debugprepost(struct vnode *vp, const char *name) { if (vp->v_type == VCHR) ; else if (MNT_EXTENDED_SHARED(vp->v_mount)) ASSERT_VOP_LOCKED(vp, name); else ASSERT_VOP_ELOCKED(vp, name); } void vop_fsync_debugpre(void *a) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fsync_debugpost(void *a, int rc __unused) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpre(void *a) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpost(void *a, int rc __unused) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && kn->kn_filter != EVFILT_WRITE), ("READ/WRITE filter on a FIFO leaked through")); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). */ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. */ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } diff --git a/sys/security/mac/mac_syscalls.c b/sys/security/mac/mac_syscalls.c index 9bc334686aa4..ecb451ac2018 100644 --- a/sys/security/mac/mac_syscalls.c +++ b/sys/security/mac/mac_syscalls.c @@ -1,662 +1,663 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * Copyright (c) 2008 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC FEATURE(security_mac, "Mandatory Access Control Framework support"); static int kern___mac_get_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow); static int kern___mac_set_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow); int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { char *elements, *buffer; struct mac mac; struct proc *tproc; struct ucred *tcred; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); tproc = pfind(uap->pid); if (tproc == NULL) return (ESRCH); tcred = NULL; /* Satisfy gcc. */ error = p_cansee(td, tproc); if (error == 0) tcred = crhold(tproc->p_ucred); PROC_UNLOCK(tproc); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); crfree(tcred); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(tcred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); crfree(tcred); return (error); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { char *elements, *buffer; struct mac mac; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = mac_cred_externalize_label(td->td_ucred->cr_label, elements, buffer, mac.m_buflen); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { struct ucred *newcred, *oldcred; struct label *intlabel; struct proc *p; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_CRED)) return (EINVAL); error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_cred_label_alloc(); error = mac_cred_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; error = mac_cred_check_relabel(oldcred, intlabel); if (error) { PROC_UNLOCK(p); crfree(newcred); goto out; } setsugid(p); crcopy(newcred, oldcred); mac_cred_relabel(newcred, intlabel); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); mac_proc_vm_revoke(td); out: mac_cred_label_free(intlabel); return (error); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { char *elements, *buffer; struct label *intlabel; struct file *fp; struct mac mac; struct vnode *vp; struct pipe *pipe; struct socket *so; cap_rights_t rights; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_GET), &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } vp = fp->f_vnode; intlabel = mac_vnode_label_alloc(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); mac_vnode_copy_label(vp->v_label, intlabel); VOP_UNLOCK(vp); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } pipe = fp->f_data; intlabel = mac_pipe_label_alloc(); PIPE_LOCK(pipe); mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel); PIPE_UNLOCK(pipe); error = mac_pipe_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } so = fp->f_data; intlabel = mac_socket_label_alloc(M_WAITOK); SOCK_LOCK(so); mac_socket_copy_label(so->so_label, intlabel); SOCK_UNLOCK(so); error = mac_socket_externalize_label(intlabel, elements, buffer, mac.m_buflen); mac_socket_label_free(intlabel); break; default: error = EINVAL; } if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (kern___mac_get_path(td, uap->path_p, uap->mac_p, FOLLOW)); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (kern___mac_get_path(td, uap->path_p, uap->mac_p, NOFOLLOW)); } static int kern___mac_get_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow) { char *elements, *buffer; struct nameidata nd; struct label *intlabel; struct mac mac; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL); if (error) { free(elements, M_MACTEMP); return (error); } buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO); NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p); error = namei(&nd); if (error) goto out; intlabel = mac_vnode_label_alloc(); mac_vnode_copy_label(nd.ni_vp->v_label, intlabel); error = mac_vnode_externalize_label(intlabel, elements, buffer, mac.m_buflen); - NDFREE(&nd, 0); + vput(nd.ni_vp); + NDFREE_PNBUF(&nd); mac_vnode_label_free(intlabel); if (error == 0) error = copyout(buffer, mac.m_string, strlen(buffer)+1); out: free(buffer, M_MACTEMP); free(elements, M_MACTEMP); return (error); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { struct label *intlabel; struct pipe *pipe; struct socket *so; struct file *fp; struct mount *mp; struct vnode *vp; struct mac mac; cap_rights_t rights; char *buffer; int error; error = copyin(uap->mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } error = fget(td, uap->fd, cap_rights_init_one(&rights, CAP_MAC_SET), &fp); if (error) goto out; switch (fp->f_type) { case DTYPE_FIFO: case DTYPE_VNODE: if (!(mac_labeled & MPC_OBJECT_VNODE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); if (error) { mac_vnode_label_free(intlabel); break; } vp = fp->f_vnode; error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) { mac_vnode_label_free(intlabel); break; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = vn_setlabel(vp, intlabel, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); mac_vnode_label_free(intlabel); break; case DTYPE_PIPE: if (!(mac_labeled & MPC_OBJECT_PIPE)) { error = EINVAL; goto out_fdrop; } intlabel = mac_pipe_label_alloc(); error = mac_pipe_internalize_label(intlabel, buffer); if (error == 0) { pipe = fp->f_data; PIPE_LOCK(pipe); error = mac_pipe_label_set(td->td_ucred, pipe->pipe_pair, intlabel); PIPE_UNLOCK(pipe); } mac_pipe_label_free(intlabel); break; case DTYPE_SOCKET: if (!(mac_labeled & MPC_OBJECT_SOCKET)) { error = EINVAL; goto out_fdrop; } intlabel = mac_socket_label_alloc(M_WAITOK); error = mac_socket_internalize_label(intlabel, buffer); if (error == 0) { so = fp->f_data; error = mac_socket_label_set(td->td_ucred, so, intlabel); } mac_socket_label_free(intlabel); break; default: error = EINVAL; } out_fdrop: fdrop(fp, td); out: free(buffer, M_MACTEMP); return (error); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (kern___mac_set_path(td, uap->path_p, uap->mac_p, FOLLOW)); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (kern___mac_set_path(td, uap->path_p, uap->mac_p, NOFOLLOW)); } static int kern___mac_set_path(struct thread *td, const char *path_p, struct mac *mac_p, int follow) { struct label *intlabel; struct nameidata nd; struct mount *mp; struct mac mac; char *buffer; int error; if (!(mac_labeled & MPC_OBJECT_VNODE)) return (EINVAL); error = copyin(mac_p, &mac, sizeof(mac)); if (error) return (error); error = mac_check_structmac_consistent(&mac); if (error) return (error); buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK); error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL); if (error) { free(buffer, M_MACTEMP); return (error); } intlabel = mac_vnode_label_alloc(); error = mac_vnode_internalize_label(intlabel, buffer); free(buffer, M_MACTEMP); if (error) goto out; NDINIT(&nd, LOOKUP, LOCKLEAF | follow, UIO_USERSPACE, path_p); error = namei(&nd); if (error == 0) { error = vn_start_write(nd.ni_vp, &mp, V_WAIT | V_PCATCH); if (error == 0) { error = vn_setlabel(nd.ni_vp, intlabel, td->td_ucred); vn_finished_write(mp); } + vput(nd.ni_vp); + NDFREE_PNBUF(&nd); } - - NDFREE(&nd, 0); out: mac_vnode_label_free(intlabel); return (error); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { struct mac_policy_conf *mpc; char target[MAC_MAX_POLICY_NAME]; int error; error = copyinstr(uap->policy, target, sizeof(target), NULL); if (error) return (error); error = ENOSYS; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); goto out; } } if (!LIST_EMPTY(&mac_policy_list)) { mac_policy_slock_sleep(); LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { if (strcmp(mpc->mpc_name, target) == 0 && mpc->mpc_ops->mpo_syscall != NULL) { error = mpc->mpc_ops->mpo_syscall(td, uap->call, uap->arg); break; } } mac_policy_sunlock_sleep(); } out: return (error); } #else /* !MAC */ int sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap) { return (ENOSYS); } int sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap) { return (ENOSYS); } int sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap) { return (ENOSYS); } int sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap) { return (ENOSYS); } int sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap) { return (ENOSYS); } int sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap) { return (ENOSYS); } int sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap) { return (ENOSYS); } int sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap) { return (ENOSYS); } int sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap) { return (ENOSYS); } int sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap) { return (ENOSYS); } #endif /* !MAC */