Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 366949)
+++ head/sys/kern/vfs_cache.c	(revision 366950)
@@ -1,4625 +1,4655 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Poul-Henning Kamp of the FreeBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #ifdef DDB #include #endif #include static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache"); SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", "const char *"); SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", "struct namecache *", "int", "int"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; u_char neg_hit; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. 
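 *
 * Because the two layouts agree up to nc_nlen, a plain struct namecache
 * pointer whose NCF_TS flag is set can be converted back to the enclosing
 * timestamped entry.  A minimal sketch of that recovery, mirroring what
 * cache_out_ts() below does:
 *
 *	if ((ncp->nc_flag & NCF_TS) != 0) {
 *		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 *		*tsp = ncp_ts->nc_time;
 *	}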
*/ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ struct namecache nc_nc; }; /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) #define CACHE_PATH_CUTOFF 39 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ static bool cache_ncp_canuse(struct namecache *ncp) { atomic_thread_fence_acq(); return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); } /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (dvp, name) where dvp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. * * These locks are used (in the order in which they can be taken): * NAME TYPE ROLE * vnodelock mtx vnode lists and v_cache_dd field protection * bucketlock mtx for access to given set of hash buckets * neglist mtx negative entry LRU management * * It is legal to take multiple vnodelock and bucketlock locks. The locking * order is lower address first. Both are recursive. * * "." lookups are lockless. * * ".." and vnode -> name lookups require vnodelock. * * name -> vnode lookup requires the relevant bucketlock to be held for reading. * * Insertions and removals of entries require involved vnodes and bucketlocks * to be locked to provide safe operation against other threads modifying the * cache. 
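 *
 * As a rough sketch, a removal touching two vnodes takes their vnodelocks
 * in address order and only then the relevant bucketlock, e.g.:
 *
 *	cache_sort_vnodes(&dvlp, &vlp);
 *	cache_lock_vnodes(dvlp, vlp);
 *	mtx_lock(blp);
 *	cache_zap_locked(ncp);
 *
 * (compare cache_zap_unlocked_bucket() below).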
* * Some lookups result in removal of the found entry (e.g. getting rid of a * negative entry with the intent to create a positive one), which poses a * problem when multiple threads reach the state. Similarly, two different * threads can purge two different vnodes and try to remove the same name. * * If the already held vnode lock is lower than the second required lock, we * can just take the other lock. However, in the opposite case, this could * deadlock. As such, this is resolved by trylocking and if that fails unlocking * the first node, locking everything in order and revalidating the state. */ VFS_SMR_DECLARE; static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache parameters"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RW, &ncsize, 0, "Total namecache capacity"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); /* * Negative entry % of namecahe capacity above which automatic eviction is allowed. * * Check cache_neg_evict_cond for details. */ static u_int ncnegminpct = 3; static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, "Negative entry count above which automatic eviction is allowed"); /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ struct nchstats nchstats; /* cache effectiveness statistics */ static bool __read_frequently cache_fast_revlookup = true; SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW, &cache_fast_revlookup, 0, ""); static u_int __exclusive_cache_line neg_cycle; #define ncneghash 3 #define numneglists (ncneghash + 1) struct neglist { struct mtx nl_evict_lock; struct mtx nl_lock __aligned(CACHE_LINE_SIZE); TAILQ_HEAD(, namecache) nl_list; TAILQ_HEAD(, namecache) nl_hotlist; u_long nl_hotnum; } __aligned(CACHE_LINE_SIZE); static struct neglist neglists[numneglists]; static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(ncp->nc_flag & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct mtx_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } /* * UMA zones for the VFS cache. 
* * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; static struct namecache * cache_alloc(int len, int ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free(struct namecache *ncp) { struct namecache_ts *ncp_ts; MPASS(ncp != NULL); if ((ncp->nc_flag & NCF_DVDROP) != 0) vdrop(ncp->nc_dvp); if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_time; *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define STATNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); STATNODE_ULONG(count, numcache, "Number of cache entries"); STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(dothits, dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothis, dotdothits, "Number of '..' hits"); STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(posszaps, numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(negzaps, numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. 
*/ STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(fullpathfail1, numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(fullpathfail2, numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); /* * Debug or developer statistics. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache debugging"); #define DEBUGNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define DEBUGNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); DEBUGNODE_COUNTER(zap_bucket_relock_success, zap_bucket_relock_success, "Number of successful removals after relocking"); static long zap_bucket_fail; DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); static long zap_bucket_fail2; DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); static long cache_lock_vnodes_cel_3_failures; DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, size_t *buflen); static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend); static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * TODO: With the value stored we can do better than computing the hash based * on the address. The choice of FNV should also be revisited. 
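 *
 * The current scheme is two-level: cache_prehash() seeds v_nchash from the
 * vnode address once, and cache_get_hash() folds the name into that seed,
 * so the bucket for a (dvp, name) pair is found roughly as in this sketch:
 *
 *	hash = fnv_32_buf(name, len, dvp->v_nchash);
 *	ncpp = NCHHASH(hash);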
*/ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct mtx * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_OWNED); } static void cache_assert_bucket_unlocked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_NOTOWNED); } #else #define cache_assert_bucket_locked(x) do { } while (0) #define cache_assert_bucket_unlocked(x) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_lock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_unlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); static void cache_recalc_neg_min(u_int val) { neg_min = (ncsize * val) / 100; } static int sysctl_negminpct(SYSCTL_HANDLER_ARGS) { u_int val; int error; val = ncnegminpct; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == ncnegminpct) return (0); if (val < 0 || val > 99) return (EINVAL); ncnegminpct = val; cache_recalc_neg_min(val); return (0); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, "I", "Negative entry \% of namecahe 
capacity above which automatic eviction is allowed"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * Various workloads create plenty of negative entries and barely use them * afterwards. Moreover malicious users can keep performing bogus lookups * adding even more entries. For example "make tinderbox" as of writing this * comment ends up with 2.6M namecache entries in total, 1.2M of which are * negative. * * As such, a rather aggressive eviction method is needed. The currently * employed method is a placeholder. * * Entries are split over numneglists separate lists, each of which is further * split into hot and cold entries. Entries get promoted after getting a hit. * Eviction happens on addition of new entry. 
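 *
 * Promotion is driven by a small per-entry hit counter: a hit bumps
 * ns->neg_hit and the entry is moved to the per-list hot queue once the
 * counter reaches CACHE_NEG_PROMOTION_THRESH.  In lookup terms the flow is
 * roughly:
 *
 *	if (cache_neg_hit_prep(ncp))
 *		cache_neg_promote(ncp);
 *	else
 *		cache_neg_hit_finish(ncp);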
*/ static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache negative entry statistics"); SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, "Number of negative cache entries"); static COUNTER_U64_DEFINE_EARLY(neg_created); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, "Number of created negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evicted); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, "Number of evicted negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, &neg_evict_skipped_empty, "Number of times evicting failed due to lack of entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, &neg_evict_skipped_missed, "Number of times evicting failed due to target entry disappearing"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, &neg_evict_skipped_contended, "Number of times evicting failed due to contention"); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, "Number of cache hits (negative)"); static int sysctl_neg_hot(SYSCTL_HANDLER_ARGS) { int i, out; out = 0; for (i = 0; i < numneglists; i++) out += neglists[i].nl_hotnum; return (SYSCTL_OUT(req, &out, sizeof(out))); } SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", "Number of hot negative entries"); static void cache_neg_init(struct namecache *ncp) { struct negstate *ns; ncp->nc_flag |= NCF_NEGATIVE; ns = NCP2NEGSTATE(ncp); ns->neg_flag = 0; ns->neg_hit = 0; counter_u64_add(neg_created, 1); } #define CACHE_NEG_PROMOTION_THRESH 2 static bool cache_neg_hit_prep(struct namecache *ncp) { struct negstate *ns; u_char n; ns = NCP2NEGSTATE(ncp); n = atomic_load_char(&ns->neg_hit); for (;;) { if (n >= CACHE_NEG_PROMOTION_THRESH) return (false); if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) break; } return (n + 1 == CACHE_NEG_PROMOTION_THRESH); } /* * Nothing to do here but it is provided for completeness as some * cache_neg_hit_prep callers may end up returning without even * trying to promote. */ #define cache_neg_hit_abort(ncp) do { } while (0) static void cache_neg_hit_finish(struct namecache *ncp) { SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); counter_u64_add(numneghits, 1); } /* * Move a negative entry to the hot list. */ static void cache_neg_promote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); if ((ns->neg_flag & NEG_HOT) == 0) { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum++; ns->neg_flag |= NEG_HOT; } } /* * Move a hot negative entry to the cold list. */ static void cache_neg_demote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); MPASS(ns->neg_flag & NEG_HOT); TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); nl->nl_hotnum--; ns->neg_flag &= ~NEG_HOT; atomic_store_char(&ns->neg_hit, 0); } /* * Move a negative entry to the hot list if it matches the lookup. 
* * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Exiting the section means * the found entry could have been evicted. We are going to look it * up again. */ static bool cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, struct namecache *oncp, uint32_t hash) { struct namecache *ncp; struct neglist *nl; u_char nc_flag; nl = NCP2NEGLIST(oncp); mtx_lock(&nl->nl_lock); /* * For hash iteration. */ vfs_smr_enter(); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * XXX There are no provisions to keep the vnode around, meaning we may * end up promoting a negative entry for a *new* vnode and returning * ENOENT on its account. This is the error we want to return anyway * and promotion is harmless. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (__predict_false(!cache_ncp_canuse(ncp))) { goto out_abort; } cache_neg_promote_locked(ncp); cache_neg_hit_finish(ncp); vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (true); out_abort: vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (false); } static void cache_neg_promote(struct namecache *ncp) { struct neglist *nl; nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); cache_neg_promote_locked(ncp); mtx_unlock(&nl->nl_lock); } static void cache_neg_insert(struct namecache *ncp) { struct neglist *nl; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); mtx_unlock(&nl->nl_lock); atomic_add_long(&numneg, 1); } static void cache_neg_remove(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); ns = NCP2NEGSTATE(ncp); mtx_lock(&nl->nl_lock); if ((ns->neg_flag & NEG_HOT) != 0) { TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum--; } else { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); } mtx_unlock(&nl->nl_lock); atomic_subtract_long(&numneg, 1); } static struct neglist * cache_neg_evict_select_list(void) { struct neglist *nl; u_int c; c = atomic_fetchadd_int(&neg_cycle, 1) + 1; nl = &neglists[c % numneglists]; if (!mtx_trylock(&nl->nl_evict_lock)) { counter_u64_add(neg_evict_skipped_contended, 1); return (NULL); } return (nl); } static struct namecache * cache_neg_evict_select_entry(struct neglist *nl) { struct namecache *ncp, *lncp; struct negstate *ns, *lns; int i; mtx_assert(&nl->nl_evict_lock, MA_OWNED); mtx_assert(&nl->nl_lock, MA_OWNED); ncp = TAILQ_FIRST(&nl->nl_list); if (ncp == NULL) return (NULL); lncp = ncp; lns = NCP2NEGSTATE(lncp); for (i = 1; i < 4; i++) { ncp = TAILQ_NEXT(ncp, nc_dst); if (ncp == NULL) break; ns = NCP2NEGSTATE(ncp); if (ns->neg_hit < lns->neg_hit) { lncp = ncp; lns = ns; } } return (lncp); } static bool cache_neg_evict(void) { struct namecache *ncp, *ncp2; struct neglist *nl; struct negstate *ns; struct 
vnode *dvp; struct mtx *dvlp; struct mtx *blp; uint32_t hash; u_char nlen; bool evicted; nl = cache_neg_evict_select_list(); if (nl == NULL) { return (false); } mtx_lock(&nl->nl_lock); ncp = TAILQ_FIRST(&nl->nl_hotlist); if (ncp != NULL) { cache_neg_demote_locked(ncp); } ncp = cache_neg_evict_select_entry(nl); if (ncp == NULL) { counter_u64_add(neg_evict_skipped_empty, 1); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); return (false); } ns = NCP2NEGSTATE(ncp); nlen = ncp->nc_nlen; dvp = ncp->nc_dvp; hash = cache_get_hash(ncp->nc_name, nlen, dvp); dvlp = VP2VNODELOCK(dvp); blp = HASH2BUCKETLOCK(hash); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); mtx_lock(dvlp); mtx_lock(blp); /* * Note that since all locks were dropped above, the entry may be * gone or reallocated to be something else. */ CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { if (ncp2 == ncp && ncp2->nc_dvp == dvp && ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) break; } if (ncp2 == NULL) { counter_u64_add(neg_evict_skipped_missed, 1); ncp = NULL; evicted = false; } else { MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); MPASS(blp == NCP2BUCKETLOCK(ncp)); SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(neg_evicted, 1); evicted = true; } mtx_unlock(blp); mtx_unlock(dvlp); if (ncp != NULL) cache_free(ncp); return (evicted); } /* * Maybe evict a negative entry to create more room. * * The ncnegfactor parameter limits what fraction of the total count * can comprise of negative entries. However, if the cache is just * warming up this leads to excessive evictions. As such, ncnegminpct * (recomputed to neg_min) dictates whether the above should be * applied. * * Try evicting if the cache is close to full capacity regardless of * other considerations. */ static bool cache_neg_evict_cond(u_long lnumcache) { u_long lnumneg; if (ncsize - 1000 < lnumcache) goto out_evict; lnumneg = atomic_load_long(&numneg); if (lnumneg < neg_min) return (false); if (lnumneg * ncnegfactor < lnumcache) return (false); out_evict: return (cache_neg_evict()); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. 
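 *
 * The caller must hold the vnodelocks of dvp and (for positive entries) vp
 * as well as the entry's bucketlock; the entry is unhooked from the hash
 * chain, from the per-vnode source/destination lists or the negative lists,
 * and from v_cache_dd where applicable.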
*/ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(ncp->nc_vp); cache_assert_vnode_locked(ncp->nc_dvp); cache_assert_bucket_locked(ncp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, ncp->nc_name, ncp->nc_vp); TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) { vn_seqc_write_begin_unheld(ncp->nc_vp); ncp->nc_vp->v_cache_dd = NULL; vn_seqc_write_end(ncp->nc_vp); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, ncp->nc_name); cache_neg_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) { vn_seqc_write_begin_unheld(ncp->nc_dvp); ncp->nc_dvp->v_cache_dd = NULL; vn_seqc_write_end(ncp->nc_dvp); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; counter_u64_add(numcachehv, -1); } } atomic_subtract_long(&numcache, 1); } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct mtx *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct mtx *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. 
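 *
 * Since all locks were dropped in the meantime the entry may already be
 * gone; it is only zapped if the re-lookup finds the same pointer with the
 * same (dvp, name) key, otherwise EAGAIN is returned and the caller retries.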
*/ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct mtx *blp) { struct namecache *rncp; cache_assert_bucket_unlocked(ncp); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); mtx_lock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) break; } if (rncp != NULL) { cache_zap_locked(rncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); counter_u64_add(zap_bucket_relock_success, 1); return (0); } mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct mtx *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; mtx_unlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static __noinline int cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { vn_seqc_write_begin(dvp); dvp->v_cache_dd = NULL; vn_seqc_write_end(dvp); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); return (1); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (ncp == NULL) { mtx_unlock(blp); goto out_no_entry; } error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail++; goto retry; } counter_u64_add(numposzaps, 1); SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); cache_free(ncp); return (1); out_no_entry: counter_u64_add(nummisszap, 1); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... 
*/ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static int __noinline cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct mtx *dvlp; enum vgetstate vs; int error, ltype; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) != 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { cache_remove_cnp(dvp, cnp); return (0); } counter_u64_add(dotdothits, 1); retry: dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; if (*vpp == NULL) goto negative_success; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } MPASS(dvp != *vpp); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); vs = vget_prep(*vpp); mtx_unlock(dvlp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); cache_zap_negative_locked_vnode_kl(ncp, dvp); mtx_unlock(dvlp); cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(dvlp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * Either both tsp and ticks have to be provided or neither of them. * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . 
an extra ref is taken, but the * lock is not recursively acquired. */ static int __noinline cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; enum vgetstate vs; int error; bool whiteout; - MPASS((cnp->cn_flags & (MAKEENTRY | ISDOTDOT)) == MAKEENTRY); + MPASS((cnp->cn_flags & ISDOTDOT) == 0); + MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); retry: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { mtx_unlock(blp); SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); vs = vget_prep(*vpp); mtx_unlock(blp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_bucket_fail2++; goto retry; } cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(blp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; enum vgetstate vs; int error; bool whiteout, neg_promote; u_short nc_flag; MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); } MPASS((cnp->cn_flags & ISDOTDOT) == 0); - if ((cnp->cn_flags & MAKEENTRY) == 0) { + if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { cache_remove_cnp(dvp, cnp); return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); vfs_smr_enter(); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { vfs_smr_exit(); SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); counter_u64_add(nummiss, 1); return (0); } nc_flag = atomic_load_char(&ncp->nc_flag); if (nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto out_fallback; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto out_fallback; } error = 
vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto out_fallback; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { vfs_smr_exit(); goto out_fallback; } } cache_out_ts(ncp, tsp, ticksp); whiteout = (ncp->nc_flag & NCF_WHITE); neg_promote = cache_neg_hit_prep(ncp); if (__predict_false(!cache_ncp_canuse(ncp))) { cache_neg_hit_abort(ncp); vfs_smr_exit(); goto out_fallback; } if (neg_promote) { vfs_smr_exit(); if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) goto out_fallback; } else { cache_neg_hit_finish(ncp); vfs_smr_exit(); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); out_fallback: return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); } struct celockstate { struct mtx *vlp[3]; struct mtx *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, struct mtx *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { mtx_lock(blp1); cel->blp[0] = blp1; } mtx_lock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) mtx_unlock(cel->blp[0]); mtx_unlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). 
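 *
 * As a concrete sketch of the worst case: when entering a name for a
 * directory vp whose old ".." entry points at a third vnode, the set held
 * at the end is roughly
 *
 *	VP2VNODELOCK(dvp), VP2VNODELOCK(vp), VP2VNODELOCK(old dd vnode),
 *	HASH2BUCKETLOCK(new hash), NCP2BUCKETLOCK(old dd entry)
 *
 * with each group taken in address order.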
*/ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = vp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if (ncp->nc_flag & NCF_NEGATIVE) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = dvp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if (ncp->nc_flag & NCF_NEGATIVE) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (dvp->v_cache_dd == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); vn_seqc_write_begin(dvp); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } dvp->v_cache_dd = NULL; vn_seqc_write_end(dvp); cache_enter_unlock(&cel); if (ncp != NULL) cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts; struct nchashhead *ncpp; uint32_t hash; int flag; int len; u_long lnumcache; VNPASS(dvp != vp, dvp); VNPASS(!VN_IS_DOOMED(dvp), dvp); VNPASS(dvp->v_type != VNON, dvp); if (vp != NULL) { VNPASS(!VN_IS_DOOMED(vp), vp); VNPASS(vp->v_type != VNON, vp); } #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } /* * Avoid blowout in namecache entries. * * Bugs: * 1. 
filesystems may end up tryng to add an already existing entry * (for example this can happen after a cache miss during concurrent * lookup), in which case we will call cache_neg_evict despite not * adding anything. * 2. the routine may fail to free anything and no provisions are made * to make it try harder (see the inside for failure modes) * 3. it only ever looks at negative entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (cache_neg_evict_cond(lnumcache)) { lnumcache = atomic_load_long(&numcache); } if (__predict_false(lnumcache >= ncsize)) { atomic_subtract_long(&numcache, 1); counter_u64_add(numdrops, 1); return; } cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_neg_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); CK_SLIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { MPASS(cache_ncp_canuse(n2)); if ((n2->nc_flag & NCF_NEGATIVE) != 0) KASSERT(vp == NULL, ("%s: found entry pointing to a different vnode (%p != %p)", __func__, NULL, vp)); else KASSERT(n2->nc_vp == vp, ("%s: found entry pointing to a different vnode (%p != %p)", __func__, n2->nc_vp, vp)); /* * Entries are supposed to be immutable unless in the * process of getting destroyed. Accommodating for * changing timestamps is possible but not worth it. * This should be harmless in terms of correctness, in * the worst case resulting in an earlier expiration. * Alternatively, the found entry can be replaced * altogether. */ MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); #if 0 if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } #endif SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, vp); goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); vn_seqc_write_begin(dvp); dvp->v_cache_dd = ncp; vn_seqc_write_end(dvp); } if (vp != NULL) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. 
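 *
 * In other words vp->v_cache_dd is pointed at this regular entry so that a
 * later ".." lookup from vp can be answered via nc_dvp (see
 * cache_lookup_dotdot() above); any stale dotdot entry hanging off vp is
 * zapped first.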
*/ vn_seqc_write_begin(vp); if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } vp->v_cache_dd = ncp; vn_seqc_write_end(vp); } else if (vp->v_type != VDIR) { if (vp->v_cache_dd != NULL) { vn_seqc_write_begin(vp); vp->v_cache_dd = NULL; vn_seqc_write_end(vp); } } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { vhold(dvp); counter_u64_add(numcachehv, 1); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; cache_neg_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (ndd != NULL) cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); atomic_subtract_long(&numcache, 1); cache_free(ncp); return; } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; cache_recalc_neg_min(ncnegminpct); nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); for (i 
= 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); TAILQ_INIT(&neglists[i].nl_hotlist); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; u_long newncsize; int i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } ncsize = newncsize; cache_recalc_neg_min(ncnegminpct); cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); } /* * Invalidate all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp, *vlp2; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Opportunistic check to see if there is anything to do. */ static bool cache_has_entries(struct vnode *vp) { if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL) return (false); return (true); } void cache_purge(struct vnode *vp) { SDT_PROBE1(vfs, namecache, purge, done, vp); if (!cache_has_entries(vp)) return; cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } /* * Serialize against a potential thread doing cache_purge. */ vlp = VP2VNODELOCK(vp); mtx_wait_unlocked(vlp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } return; } /* * Invalidate all negative entries for a particular directory vnode. 
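 *
 * As with cache_purge_impl() above, freeing is deferred: matching
 * entries are unhooked onto a local list while the vnode lock is held
 * and cache_free() only runs once it is dropped, in outline:
 *
 *        mtx_lock(vlp);
 *        collect negative entries from v_cache_src onto a local "ncps" list;
 *        mtx_unlock(vlp);
 *        cache_free() everything on "ncps";
 *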
*/ void cache_purge_negative(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } mtx_unlock(vlp); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } void cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { ASSERT_VOP_IN_SEQC(fdvp); ASSERT_VOP_IN_SEQC(fvp); ASSERT_VOP_IN_SEQC(tdvp); if (tvp != NULL) ASSERT_VOP_IN_SEQC(tvp); cache_purge(fvp); if (tvp != NULL) { cache_purge(tvp); KASSERT(!cache_remove_cnp(tdvp, tcnp), ("%s: lingering negative entry", __func__)); } else { cache_remove_cnp(tdvp, tcnp); } } + +#ifdef INVARIANTS +/* + * Validate that if an entry exists it matches. + */ +void +cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) +{ + struct namecache *ncp; + struct mtx *blp; + uint32_t hash; + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + if (CK_SLIST_EMPTY(NCHHASH(hash))) + return; + blp = HASH2BUCKETLOCK(hash); + mtx_lock(blp); + CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { + if (ncp->nc_vp != vp) + panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p vp %p\n", + __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp, + ncp->nc_vp); + } + } + mtx_unlock(blp); +} +#endif /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp) { struct vnode *vp, *mvp; SDT_PROBE1(vfs, namecache, purgevfs, done, mp); /* * Somewhat wasteful iteration over all vnodes. Would be better to * support filtering and avoid the interlock to begin with. */ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (!cache_has_entries(vp)) { VI_UNLOCK(vp); continue; } vholdl(vp); VI_UNLOCK(vp); cache_purge(vp); vdrop(vp); } } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. 
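 *
 * In outline: the user-supplied length is clamped to MAXPATHLEN, a
 * pathname buffer is taken from namei_zone, vn_getcwd() resolves the
 * current directory relative to the process root and the result is
 * copied out.  vn_getcwd() itself tries the lockless (SMR) reverse
 * lookup first and only takes the locked path when that gives up:
 *
 *        error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, ...);
 *        if (error < 0)
 *                error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, ...);
 *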
*/ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) return (error); error = vn_fullpath_hardlink(&nd, &retbuf, &freebuf, &size); if (error == 0) { error = copyout(retbuf, buf, size); free(freebuf, M_TEMP); } NDFREE(&nd, 0); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. 
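 *
 * A minimal usage sketch (the buffer handling mirrors
 * vn_path_to_global_path() below; the consumer shown is hypothetical):
 *
 *        char *rpath, *fbuf;
 *
 *        error = vn_fullpath_global(vp, &rpath, &fbuf);
 *        if (error == 0) {
 *                record_global_path(rpath);        (assumed consumer)
 *                free(fbuf, M_TEMP);
 *        }
 *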
*/ int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } static struct namecache * vn_dd_from_dst(struct vnode *vp) { struct namecache *ncp; cache_assert_vnode_locked(vp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) return (ncp); } return (NULL); } int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); ncp = (*vp)->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { KASSERT(ncp == vn_dd_from_dst(*vp), ("%s: mismatch for dd entry (%p != %p)", __func__, ncp, vn_dd_from_dst(*vp))); } else { ncp = vn_dd_from_dst(*vp); } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; bool slash_prefixed; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; slash_prefixed = true; if (addend == 0) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; slash_prefixed = false; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. 
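 *
 * For example (an assumed layout): with a filesystem mounted on /mnt,
 * once the walk reaches that filesystem's root vnode it continues from
 * the vnode the mount covers:
 *
 *        vp  = root of the mounted filesystem    (VV_ROOT set)
 *        vp1 = vp->v_mount->mnt_vnodecovered     ("/mnt" in the parent fs)
 *        vp1->v_mountedhere == vp->v_mount       (else: raced with unmount)
 *        vp  = vp1                               (keep walking towards "/")
 *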
*/ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static void __inline cache_rev_failed_impl(int *reason, int line) { *reason = line; } #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *tvp; struct mount *mp; struct namecache *ncp; size_t orig_buflen; int reason; int error; #ifdef KDTRACE_HOOKS int i; #endif seqc_t vp_seqc, tvp_seqc; u_char nc_flag; VFS_SMR_ASSERT_ENTERED(); if (!cache_fast_revlookup) { vfs_smr_exit(); return (-1); } orig_buflen = *buflen; if (addend == 0) { MPASS(*buflen >= 2); *buflen -= 1; buf[*buflen] = '\0'; } if (vp == rdir || vp == rootvnode) { if (addend == 0) { *buflen -= 1; buf[*buflen] = '/'; } goto out_ok; } #ifdef KDTRACE_HOOKS i = 0; #endif error = -1; ncp = NULL; /* for sdt probe down below */ vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } for (;;) { #ifdef KDTRACE_HOOKS i++; #endif if ((vp->v_vflag & VV_ROOT) != 0) { mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) { cache_rev_failed(&reason); goto out_abort; } tvp = atomic_load_ptr(&mp->mnt_vnodecovered); tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; continue; } ncp = atomic_load_ptr(&vp->v_cache_dd); if (ncp == NULL) { cache_rev_failed(&reason); goto out_abort; } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { cache_rev_failed(&reason); goto out_abort; } if (!cache_ncp_canuse(ncp)) { cache_rev_failed(&reason); goto out_abort; } if (ncp->nc_nlen >= *buflen) { cache_rev_failed(&reason); error = ENOMEM; goto out_abort; } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); *buflen -= 1; buf[*buflen] = '/'; tvp = ncp->nc_dvp; tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); 
goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; if (vp == rdir || vp == rootvnode) break; } out_ok: vfs_smr_exit(); *retbuf = buf + *buflen; *buflen = orig_buflen - *buflen + addend; SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); return (0); out_abort: *buflen = orig_buflen; SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); vfs_smr_exit(); return (error); } static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen, addend; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); addend = 0; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; addend = orig_buflen - *buflen; } return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * * Since the namecache does not track handlings, the caller is expected to first * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * fullowing names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; struct componentname *cnp; struct vnode *vp; size_t addend; int error; enum vtype type; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; buf = malloc(*buflen, M_TEMP, M_WAITOK); addend = 0; vp = ndp->ni_vp; /* * Check for VBAD to work around the vp_crossmp bug in lookup(). * * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be * set to mount point's root vnode while ni_dvp will be vp_crossmp. * If the type is VDIR (like in this very case) we can skip looking * at ni_dvp in the first place. However, since vnodes get passed here * unlocked the target may transition to doomed state (type == VBAD) * before we get to evaluate the condition. If this happens, we will * populate part of the buffer and descend to vn_fullpath_dir with * vp == vp_crossmp. Prevent the problem by checking for VBAD. * * This should be atomic_load(&vp->v_type) but it is ilegal to take * an address of a bit field, even if said field is sized to char. * Work around the problem by reading the value into a full-sized enum * and then re-reading it with atomic_load which will still prevent * the compiler from re-reading down the road. 
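 *
 * That is, the direct spelling
 *
 *        if (atomic_load_int(&vp->v_type) == VBAD)
 *
 * would not compile since v_type is a bit field whose address cannot be
 * taken, hence the two-step read through a local variable below.
 *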
*/ type = vp->v_type; type = atomic_load_int(&type); if (type == VBAD) { error = ENOENT; goto out_bad; } if (type != VDIR) { cnp = &ndp->ni_cnd; addend = cnp->cn_namelen + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); tmpbuf[addend - 1] = '\0'; vp = ndp->ni_dvp; } vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); vref(vp); error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); pwd_drop(pwd); if (error != 0) goto out_bad; } *freebuf = buf; return (0); out_bad: free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. 
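 *
 * A hypothetical interleaving the check catches:
 *
 *        A: vn_fullpath_global(vp) computes rpath
 *        B: renames the file out from under A
 *        A: namei() of the original path fails or yields nd.ni_vp != vp
 *        A: returns ENOENT instead of installing a stale path
 *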
*/ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif static bool __read_frequently cache_fast_lookup = true; SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, &cache_fast_lookup, 0, ""); #define CACHE_FPL_FAILED -2020 static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } static void cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) { struct componentname *cnp; cnp = &ndp->ni_cnd; while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } *dpp = ndp->ni_rootdir; } /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_saved { long cn_namelen; char *cn_nameptr; size_t ni_pathlen; int cn_flags; }; struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; struct pwd *pwd; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; struct nameidata_saved snd; int line; enum cache_fpl_status status:8; bool in_smr; bool fsearch; }; static void cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) { snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; snd->ni_pathlen = fpl->ndp->ni_pathlen; } static void cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) { fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; fpl->ndp->ni_pathlen = snd->ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { 
KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); MPASS(error != CACHE_FPL_FAILED); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (error); } #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = cnp->cn_thread; if (!cache_fast_lookup) { cache_fpl_aborted(fpl); return (false); } #ifdef MAC if (mac_vnode_check_lookup_enabled()) { cache_fpl_aborted(fpl); return (false); } #endif if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted(fpl); return (false); } if (IN_CAPABILITY_MODE(td)) { cache_fpl_aborted(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted(fpl); return (false); } return (true); } static int cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) { struct nameidata *ndp; int error; bool fsearch; ndp = fpl->ndp; error = fgetvp_lookup_smr(ndp->ni_dirfd, ndp, vpp, &fsearch); if (__predict_false(error != 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_aborted(fpl)); } fpl->fsearch = fsearch; return (0); } static bool cache_fplookup_vnode_supported(struct vnode *vp) { return (vp->v_type != VLNK); } static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct vnode *dvp; cnp = fpl->cnp; dvp = fpl->dvp; cache_fpl_smr_exit(fpl); if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) return (cache_fpl_handled(fpl, ENOENT)); else return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. 
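 *
 * The handoff below, in outline: a hold on the current directory is
 * secured while still inside SMR, the section is exited, the sequence
 * counter is re-checked and the saved nameidata state is restored so
 * that the regular (locked) lookup can resume from dvp:
 *
 *        dvs = vget_prep_smr(dvp);
 *        cache_fpl_smr_exit(fpl);
 *        vget_finish_ref(dvp, dvs);
 *        if (!vn_seqc_consistent(dvp, dvp_seqc))
 *                abort;
 *        cache_fpl_restore(...);
 *        ndp->ni_startdir = dvp;
 *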
*/ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; pwd = fpl->pwd; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; if (!pwd_hold_smr(pwd)) { cache_fpl_smr_exit(fpl); return (cache_fpl_aborted(fpl)); } dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore(fpl, &fpl->snd); ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl, 0)); } /* * They want to possibly modify the state of the namecache. * * Don't try to match the API contract, just leave. * TODO: this leaves scalability on the table */ static int cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct componentname *cnp; cnp = fpl->cnp; MPASS(cnp->cn_nameiop != LOOKUP); return (cache_fpl_partial(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. 
*/ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; VNPASS(cache_fplookup_vnode_supported(dvp), dvp); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { struct vnode *dvp; dvp = fpl->dvp; fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); return (0); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *dvp; struct prison *pr; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * XXX this is racy the same way regular lookup is */ for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dvp == pr->pr_root) break; if (dvp == ndp->ni_rootdir || dvp == ndp->ni_topdir || dvp == rootvnode || pr != NULL) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. 
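 *
 * That is, when dvp is the root of a mounted filesystem, ".." has to
 * continue from the vnode the filesystem is mounted on, roughly (a
 * sketch of the missing step, mirroring what the locked lookup does;
 * not working code):
 *
 *        while ((dvp->v_vflag & VV_ROOT) != 0)
 *                dvp = dvp->v_mount->mnt_vnodecovered;
 *        (with sequence counter re-validation at each step)
 *
 * Until then the lockless walk gives up and the regular lookup handles
 * the case.
 *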
*/ return (cache_fpl_aborted(fpl)); } ncp = atomic_load_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_aborted(fpl)); } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(dotdothits, 1); return (0); } static int __noinline cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) { u_char nc_flag; bool neg_promote; nc_flag = atomic_load_char(&ncp->nc_flag); MPASS((nc_flag & NCF_NEGATIVE) != 0); /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { /* * TODO * This should call something similar to * cache_fplookup_final_modifying. */ return (cache_fpl_partial(fpl)); } neg_promote = cache_neg_hit_prep(ncp); if (__predict_false(!cache_ncp_canuse(ncp))) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (__predict_false((nc_flag & NCF_WHITE) != 0)) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (neg_promote) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } cache_neg_hit_finish(ncp); cache_fpl_smr_exit(fpl); return (cache_fpl_handled(fpl, ENOENT)); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; cnp = fpl->cnp; dvp = fpl->dvp; if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { return (cache_fplookup_dot(fpl)); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* * If there is no entry we have to punt to the slow path to perform * actual lookup. Should there be nothing with this name a negative * entry will be created. */ if (__predict_false(ncp == NULL)) { return (cache_fpl_partial(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { return (cache_fplookup_neg(fpl, ncp, hash)); } if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } if (!cache_fplookup_vnode_supported(tvp)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); return (0); } static bool cache_fplookup_mp_supported(struct mount *mp) { if (mp == NULL) return (false); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). 
* * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) return (0); prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } prev_mp = mp; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static bool cache_fplookup_need_climb_mount(struct cache_fpl *fpl) { struct mount *mp; struct vnode *vp; vp = fpl->tvp; /* * Hack: while this is a union, the pointer tends to be NULL so save on * a branch. */ mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) return (false); if (vp->v_type == VDIR) return (true); return (false); } /* * Parse the path. * * The code was originally copy-pasted from regular lookup and despite * clean ups leaves performance on the table. Any modifications here * must take into account that in case off fallback the resulting * nameidata state has to be compatible with the original. */ static int cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; char *cp; ndp = fpl->ndp; cnp = fpl->cnp; /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled(fpl, ENAMETOOLONG)); } ndp->ni_pathlen -= cnp->cn_namelen; KASSERT(ndp->ni_pathlen <= PATH_MAX, ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. 
Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { /* * TODO * Regular lookup performs the following: * *ndp->ni_next = '\0'; * cnp->cn_flags |= TRAILINGSLASH; * * Which is problematic since it modifies data read * from userspace. Then if fast path lookup was to * abort we would have to either restore it or convey * the flag. Since this is a corner case just ignore * it for simplicity. */ return (cache_fpl_partial(fpl)); } } ndp->ni_next = cp; /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". * * TODO * Another corner case handled by the regular lookup */ if (__predict_false(cnp->cn_nameptr[0] == '\0')) { return (cache_fpl_partial(fpl)); } return (0); } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } } /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { struct componentname *cnp; struct vnode *dvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; /* * Hack: they may be looking up foo/bar, where foo is a * regular file. In such a case we need to turn ENOTDIR, * but we may happen to get here with a different error. */ if (dvp->v_type != VDIR) { /* * The check here is predominantly to catch * EOPNOTSUPP from dead_vnodeops. If the vnode * gets doomed past this point it is going to * fail seqc verification. */ if (VN_IS_DOOMED(dvp)) { return (cache_fpl_aborted(fpl)); } error = ENOTDIR; } /* * Hack: handle O_SEARCH. * * Open Group Base Specifications Issue 7, 2018 edition states: * If the access mode of the open file description associated with the * file descriptor is not O_SEARCH, the function shall check whether * directory searches are permitted using the current permissions of * the directory underlying the file descriptor. If the access mode is * O_SEARCH, the function shall not perform the check. * * Regular lookup tests for the NOEXECCHECK flag for every path * component to decide whether to do the permission check. However, * since most lookups never have the flag (and when they do it is only * present for the first path component), lockless lookup only acts on * it if there is a permission problem. Here the flag is represented * with a boolean so that we don't have to clear it on the way out. * * For simplicity this always aborts. * TODO: check if this is the first lookup and ignore the permission * problem. Note the flag has to survive fallback (if it happens to be * performed). 
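 *
 * For reference, the access mode in question comes from the descriptor
 * used for a *at() style lookup, e.g. (userspace sketch, assumed paths):
 *
 *        fd = open("/some/dir", O_SEARCH);
 *        fstatat(fd, "name", &sb, 0);
 *
 * where search permission on /some/dir is not re-checked, per the
 * quoted specification.
 *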
*/ if (fpl->fsearch) { return (cache_fpl_aborted(fpl)); } switch (error) { case EAGAIN: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_partial(fpl); } break; default: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; error = CACHE_FPL_FAILED; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl, &fpl->snd); fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { cache_fpl_aborted(fpl); goto out; } mp = atomic_load_ptr(&fpl->dvp->v_mount); if (!cache_fplookup_mp_supported(mp)) { cache_fpl_aborted(fpl); goto out; } VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); for (;;) { error = cache_fplookup_parse(fpl); if (__predict_false(error != 0)) { break; } VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } if (__predict_false(cache_fpl_isdotdot(cnp))) { error = cache_fplookup_dotdot(fpl); if (__predict_false(error != 0)) { break; } } else { error = cache_fplookup_next(fpl); if (__predict_false(error != 0)) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (cache_fplookup_need_climb_mount(fpl)) { error = cache_fplookup_climb_mount(fpl); if (__predict_false(error != 0)) { break; } } } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); cache_fpl_checkpoint(fpl, &fpl->snd); } out: switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_PARTIAL: cache_fpl_smr_assert_entered(fpl); return (cache_fplookup_partial_setup(fpl)); case CACHE_FPL_STATUS_ABORTED: if (fpl->in_smr) cache_fpl_smr_exit(fpl); return (CACHE_FPL_FAILED); case CACHE_FPL_STATUS_HANDLED: MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); if (__predict_false(error != 0)) { ndp->ni_dvp = NULL; ndp->ni_vp = NULL; cache_fpl_cleanup_cnp(cnp); return (error); } ndp->ni_dvp = fpl->dvp; ndp->ni_vp = fpl->tvp; if (cnp->cn_flags & SAVENAME) cnp->cn_flags |= HASBUF; else cache_fpl_cleanup_cnp(cnp); return (error); } } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. 
If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. 
* - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; struct nameidata_saved orig; int error; MPASS(ndp->ni_lcf == 0); fpl.status = CACHE_FPL_STATUS_UNSET; fpl.ndp = ndp; fpl.cnp = &ndp->ni_cnd; MPASS(curthread == fpl.cnp->cn_thread); if ((fpl.cnp->cn_flags & SAVESTART) != 0) MPASS(fpl.cnp->cn_nameiop != LOOKUP); if (!cache_can_fplookup(&fpl)) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); *status = fpl.status; return (EOPNOTSUPP); } cache_fpl_checkpoint(&fpl, &orig); cache_fpl_smr_enter_initial(&fpl); fpl.fsearch = false; pwd = pwd_get_smr(); fpl.pwd = pwd; ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; cnp = fpl.cnp; cnp->cn_nameptr = cnp->cn_pnbuf; if (cnp->cn_pnbuf[0] == '/') { cache_fpl_handle_root(ndp, &dvp); } else { if (ndp->ni_dirfd == AT_FDCWD) { dvp = pwd->pwd_cdir; } else { error = cache_fplookup_dirfd(&fpl, &dvp); if (__predict_false(error != 0)) { goto out; } } } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); out: cache_fpl_smr_assert_not_entered(&fpl); SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); *status = fpl.status; switch (fpl.status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_HANDLED: SDT_PROBE3(vfs, namei, lookup, return, error, (error == 0 ? ndp->ni_vp : NULL), true); break; case CACHE_FPL_STATUS_PARTIAL: *pwdp = fpl.pwd; /* * Status restored by cache_fplookup_partial_setup. */ break; case CACHE_FPL_STATUS_ABORTED: cache_fpl_restore(&fpl, &orig); break; } return (error); } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 366949) +++ head/sys/kern/vfs_subr.c (revision 366950) @@ -1,6737 +1,6749 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. 
Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. 
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. 
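 *
 * Editorial usage sketch, not part of the original source: this handler
 * is reached when the knob is tuned from userland, e.g.
 *
 *	u_long newval = 500000;
 *	sysctlbyname("kern.maxvnodes", NULL, NULL, &newval, sizeof(newval));
 *
 * which updates desiredvnodes, derives wantfreevnodes as one quarter of
 * the new value, recomputes the vnlru watermarks via vnlru_recalc() and
 * then resizes the VFS hash and the namecache right below.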
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. 
In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; int cpu, physvnodes, virtvnodes; u_int i; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. 
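 *
 * Editorial worked example, not part of the original comment, assuming
 * 4 KB pages and ignoring the small maxproc term: a 4 GB machine has
 * about 1,048,576 pages, i.e. pgtok() of about 4,194,304 KB, so
 *
 *	physvnodes ~= 4194304 / 64 + 3 * min(98304 * 16, 4194304) / 64
 *	           ~= 65536 + 73728 = 139264
 *
 * The final desiredvnodes is the smaller of this and the heap-derived
 * virtvnodes value, and is then clamped to MAXVNODES_MAX below.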
*/ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); uma_zone_set_smr(vnode_zone, vfs_smr); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. 
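 *
 * Editorial usage sketch, not part of the original comment: a caller
 * that only needs to keep the filesystem from being unmounted while it
 * works on it typically brackets the access as
 *
 *	error = vfs_busy(mp, MBF_NOWAIT);
 *	if (error != 0)
 *		return (error);
 *	... operate on the mount ...
 *	vfs_unbusy(mp);
 *
 * where the error is ENOENT when an unmount of mp is in progress.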
* * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mp, ref, 1); vfs_mp_count_add_pcpu(mp, lockref, 1); vfs_op_thread_exit(mp); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mp, lockref, 1); vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. 
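 *
 * Editorial worked example, not part of the original comment: with the
 * direct-mapped cache below, an fsid with val[0] = 0xabcd and
 * val[1] = 0x12 hashes as
 *
 *	hash = 0xabcd ^ 0x12 = 0xabdf
 *	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1) = 0xdf
 *
 * i.e. slot 223, so a repeat lookup of that fsid only has to re-verify
 * f_fsid after vfs_busy() instead of walking the mount list again.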
*/ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. 
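 *
 * Editorial note, not part of the original comment: the knob is exposed
 * as the vfs.timestamp_precision sysctl and defaults to 2 (TSP_USEC).
 * A sketch of selecting maximum precision from userland:
 *
 *	int prec = 3;
 *	sysctlbyname("vfs.timestamp_precision", NULL, NULL, &prec,
 *	    sizeof(prec));
 *
 * after which vfs_timestamp() below uses nanotime().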
*/ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. 
*/ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; if (!VI_TRYLOCK(vp)) goto next_iter; if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || VN_IS_DOOMED(vp) || vp->v_type == VNON) { VI_UNLOCK(vp); goto next_iter; } object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_locked(int count, struct vfsops *mnt_op) { struct vnode *vp, *mvp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; mvp = vnode_list_free_marker; restart: vp = mvp; while (count > 0) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. * Don't recycle if we can't get the interlock without * blocking. 
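 *
 * Editorial usage sketch, not part of the original comment: the public
 * wrapper vnlru_free() below takes an optional vfsops filter, so a
 * hypothetical filesystem wishing to shed only its own cheap vnodes
 * under memory pressure could call
 *
 *	vnlru_free(32, &myfs_vfsops);	(myfs_vfsops is hypothetical)
 *
 * while passing a NULL vfsops recycles free vnodes of any filesystem.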
*/ if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { continue; } TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); continue; } vholdl(vp); count--; mtx_unlock(&vnode_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); vdrop(vp); mtx_lock(&vnode_list_mtx); goto restart; } return (ocount - count); } void vnlru_free(int count, struct vfsops *mnt_op) { mtx_lock(&vnode_list_mtx); vnlru_free_locked(count, mnt_op); mtx_unlock(&vnode_list_mtx); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static __inline void vn_freevnodes_inc(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; critical_exit(); } static __inline void vn_freevnodes_dec(void) { struct vdbatch *vd; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 
1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. 
If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1, NULL) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1, NULL); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. 
* Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; v_init_counters(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. 
*/ vn_seqc_write_end_locked(vp); VNPASS(vp->v_seqc_users == 0, vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_irflag = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. 
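 *
 * Editorial usage sketch, not part of the original comment: a typical
 * VFS_VGET-style caller in a hypothetical filesystem pairs this with
 * getnewvnode() roughly as
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	vp->v_data = ip;
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 *
 * "myfs", myfs_vnodeops and ip are placeholders.  The point is that on
 * failure insmntque() has already run insmntque_stddtr(), which
 * vgone()s and vput()s the vnode, so the caller must not unlock or
 * release it again.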
*/ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. 
* */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. 
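 *
 * Editorial worked example, not part of the original comment, assuming
 * a 32 KB block size: truncating to length 65536 gives
 * howmany(65536, 32768) = 2, so buffers from lbn 2 onward are
 * invalidated and lbn 1 (holding the last valid byte at offset 65535)
 * is kept, while truncating to length 65537 gives startlbn = 3 because
 * lbn 2 still contains the final byte at offset 65536.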
*/ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! 
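 *
 * Editorial worked example, not part of the original comment: with a
 * list already holding lblkno 1, 3 and 7, adding lblkno 9 takes the
 * cheap tail path, adding lblkno 4 does a pctrie lookup_le (finding 3)
 * and is inserted right after it, and adding lblkno 0 finds no
 * predecessor and goes to the head, keeping the list sorted in every
 * case.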
*/ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
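 *
 * Editorial worked example, not part of the original comment: with the
 * default 32-slot wheel (syncer_mask == 31), a request using filedelay
 * (30 seconds) is not clamped since the cap is syncer_maxdelay - 2 == 30,
 * and if syncer_delayno currently points at slot 20 the bufobj lands in
 *
 *	slot = (20 + 30) & 31 = 18
 *
 * which the once-per-second wheel reaches about 30 seconds later.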
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. 
*/ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. 
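 *
 * Illustrative note: the usual way a buffer ends up changing lists is
 * bdirty()/bundirty() in vfs_bio.c toggling B_DELWRI and then calling
 * here, roughly:
 *
 *	bp->b_flags |= B_DELWRI;	(as in bdirty())
 *	reassignbuf(bp);
 *
 * The syncer delay chosen below depends on the vnode type (dirdelay for
 * VDIR, metadelay for VCHR, filedelay for everything else); the three are
 * sysctl-tunable (vfs.filedelay and friends) and staggered by default.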
*/ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. 
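 *
 * An illustrative sketch of the SMR path, not lifted from any particular
 * caller (the enter/exit helpers are assumed to be the usual vfs_smr
 * wrappers used elsewhere in the VFS):
 *
 *	vfs_smr_enter();
 *	vp = (lockless lookup);
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		(retry the lookup);
 *	error = vget_finish(vp, LK_SHARED, vs);
 *	(on error the reference was already dropped by vget_finish)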
*/ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. 
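 *
 * For reference, vgonel() below makes the call with both the vnode lock
 * and the interlock held, just before it sets VIRF_DOOMED; that ordering
 * is what the assertions in this function rely on.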
*/ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. 
* * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vn_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vn_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. 
*/ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void vdrop_deactivate(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist. */ VNASSERT(!VN_IS_DOOMED(vp), vp, ("vdrop: returning doomed vnode")); VNASSERT(vp->v_op != NULL, vp, ("vdrop: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with VI_OWEINACT set")); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("vnode with VI_DEFINACT set")); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } vdbatch_enqueue(vp); } static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vn_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } if (!VN_IS_DOOMED(vp)) { vn_freevnodes_inc(); vdrop_deactivate(vp); /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. 
*/ return; } vdropl_final(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static void vinactivef(struct vnode *vp) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } void vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return; if (vp->v_iflag & VI_DOINGINACT) return; if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return; } vinactivef(vp); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. 
*/ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. 
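 *
 * Implementation note: a throw-away marker mount (MNTK_MARKER) is linked
 * into mnt_uppers to remember the iteration position while the mount
 * interlock is dropped around each VFS_RECLAIM_LOWERVP() /
 * VFS_UNLINK_LOWERVP() call, so the list may change underneath us without
 * losing our place.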
*/ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_uppers)) return; mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_irflag & VIRF_DOOMED) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vp->v_irflag |= VIRF_DOOMED; /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if ((oweinact || active) && !doinginact) { VI_LOCK(vp); vinactivef(vp); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. 
*/ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (vp->v_irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); flags = vp->v_irflag & ~(VIRF_DOOMED | VIRF_PGREAD); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 
VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_TEXT_REF) strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. 
*/ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 
(uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. 
*/ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. 
*/ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. 
*/ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; int lkflags, objflags; bool seen_defer; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. 
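 *
 * In short (a summary of the code below, not new policy): each mount gets
 * at most one syncer vnode, hung off mp->mnt_syncer and kept on the
 * syncer worklist, so that sync_fsync() fires roughly once per syncdelay
 * interval and triggers vfs_periodic() plus VFS_SYNC(MNT_LAZY) for its
 * mount.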
*/ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. 
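 *
 * This is reached through vgone(), either directly from sync_inactive()
 * above or eventually via the last vrele() in vfs_deallocate_syncvnode();
 * at that point the vnode is pulled off the worklist and detached from
 * mp->mnt_syncer.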
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; int need; MPASS(mtx_owned(VI_MTX(vp))); need = 0; if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) need = 1; return (need); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible the denial will be overridden * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descend into MAC * modules overriding this result. It's quite unclear what semantics * they are allowed to rely on, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner.
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. 
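 *
 * As a rough illustration only (not lifted from any particular
 * filesystem), the relevant calls inside a VOP_RENAME implementation,
 * once the final vnodes have been looked up again and locked, would be
 * shaped like:
 *
 *	vn_seqc_write_begin(fdvp);
 *	vn_seqc_write_begin(fvp);
 *	vn_seqc_write_begin(tdvp);
 *	if (tvp != NULL)
 *		vn_seqc_write_begin(tvp);
 *	... modify the directories ...
 *	... matching vn_seqc_write_end() calls on the same vnodes ...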
*/ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } +#ifdef DEBUG_VFS_LOCKS +void +vop_mkdir_debugpost(void *ap, int rc) +{ + struct vop_mkdir_args *a; + + a = ap; + if (!rc) + cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); +} +#endif + void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } 
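/*
 * The vop_*_pre/post pairs in this file follow a common pattern: the pre
 * hook calls vn_seqc_write_begin() on every vnode the operation may
 * modify and the post hook calls vn_seqc_write_end(), letting the
 * lockless lookup code detect concurrent modifications through the
 * sequence counter; most post hooks also deliver the matching kevent
 * note when the operation succeeded.
 */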
void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) 
!= 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
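 *
 * The packing below puts the low 32 bits of the boot-relative seconds in
 * the upper half of the result and the top 32 bits of the fraction in the
 * lower half, so values handed out within a single boot are monotonically
 * non-decreasing.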
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). 
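 *
 * A minimal sketch of a caller (illustrative only):
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);
 *	error = vfs_emptydir(vp);
 *	VOP_UNLOCK(vp);
 *	if (error != 0)
 *		... ENOTEMPTY or a readdir error: do not treat vp as empty ...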
*/ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. 
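 *
 * Callers normally go through vfs_cache_root() below; a rough sketch of
 * its use (illustrative only) is:
 *
 *	error = vfs_cache_root(mp, LK_SHARED, &rvp);
 *	if (error == 0) {
 *		... rvp is referenced and locked ...
 *		vput(rvp);
 *	}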
*/ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; if (!vfs_op_thread_enter(mp)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
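 *
 * The racy read is safe because the flag is re-checked below with the
 * vnode interlock held; a stale value here only costs an extra
 * lock/unlock pair.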
*/ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. 
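 *
 * The conventional order takes the vnode interlock before the mount's
 * lazy-list mutex (see mnt_vnode_next_lazy_relock() above); since the
 * iterator already holds the latter, only a trylock is safe here, with
 * the relock helper as the fallback.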
*/ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_unheld_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); vn_seqc_write_begin_unheld_locked(vp); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_begin_unheld(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_unheld_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 366949) +++ head/sys/kern/vfs_syscalls.c (revision 366950) @@ -1,4930 +1,4930 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td); static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag); static uint64_t at2cnpflags(u_int at_flags, u_int mask) { u_int64_t res; MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) != (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)); res = 0; at_flags &= mask; if ((at_flags & AT_BENEATH) != 0) res |= BENEATH; if ((at_flags & AT_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((at_flags & AT_SYMLINK_FOLLOW) != 0) res |= FOLLOW; /* NOFOLLOW is pseudo flag */ if ((mask & AT_SYMLINK_NOFOLLOW) != 0) 
{ res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; } return (res); } int kern_sync(struct thread *td) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { return (kern_sync(td)); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); if (error != 0) { vfs_rel(mp); return (error); } error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); /* * Since quota on operation typically needs to open quota * file, the Q_QUOTAON handler needs to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is our, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) vfs_unbusy(mp); vfs_rel(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. 
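 *
 * For example, with max_size == 0x7fffffff and a largest count of 2^40,
 * the code below computes shift == 10: the counts are divided by 1024
 * and f_bsize is multiplied by 1024, bringing the largest count down to
 * 2^30 while keeping the byte totals the same.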
*/ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); if (error != 0) goto out; if (priv_check_cred_vfs_generation(td->td_ucred)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error != 0) return (error); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; int error; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef AUDIT AUDIT_ARG_VNODE1(vp); #endif mp = vp->v_mount; if (mp != NULL) vfs_ref(mp); VOP_UNLOCK(vp); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. 
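 *
 * In that case the buffer comes from malloc(9) with type M_STATFS and the
 * caller releases it the same way, as the COMPAT_FREEBSD4/11 wrappers
 * below do:
 *
 *	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, mode);
 *	if (size != 0) {
 *		... consume the "count" entries in buf ...
 *		free(buf, M_STATFS);
 *	}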
*/ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; /* * If there is no target buffer they only want the count. * * This could be TAILQ_FOREACH but it is open-coded to match the original * code below. */ if (sfsp == NULL) { mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif count++; nmp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); *countp = count; return (0); } /* * They want the entire thing. * * Short-circuit the corner case of no room for anything, avoids * relocking below. */ if (maxcount < 1) { goto out; } mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } sp = &mp->mnt_stat; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check_cred_vfs_generation(td->td_ucred)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; count++; if (count == maxcount) { vfs_unbusy(mp); goto out; } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); out: *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. 
*/ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. 
*/ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. 
*/ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(uap->fd); error = getvnode(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vrefact(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); pwd_chdir(td, nd.ni_vp); return (0); } /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) goto error; error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. 
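 *
 * change_dir() neither unlocks nor releases the vnode; on success and on
 * failure alike the caller is expected to VOP_UNLOCK() or vput() it, as
 * the callers above do.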
*/ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set_one(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set_one(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set_one(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set_one(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set_one(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set_one(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set_one(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set_one(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. */ if (flags & O_EXEC) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and set by finstall() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &rights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) goto success; /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. 
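 *
 * Descriptive note: this is the mechanism that lets an open of a
 * descriptor-backed node (e.g. fdescfs' /dev/fd/N) be completed as a dup of
 * descriptor N: the fdopen handler records N in td_dupfd and fails with
 * ENODEV/ENXIO, and dupfdopen() below finishes the open.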
*/ if ((nd.ni_resflags & NIRES_STRICTREL) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); finit_vnode(fp, flags, NULL, &vnops); } VOP_UNLOCK(vp); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_resflags & NIRES_STRICTREL) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall(td, fp, &indx, flags, fcaps); /* On success finstall() consumes fcaps. */ if (error != 0) { filecaps_free(&nd.ni_filecaps); goto bad; } } else { filecaps_free(&nd.ni_filecaps); } /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. 
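 *
 * Illustrative only (paths and dev are placeholders): a device node is
 * created with something like
 *
 *	mknodat(AT_FDCWD, "mydev", S_IFCHR | 0600, dev);
 *
 * which requires PRIV_VFS_MKNOD_DEV.  S_IFIFO with dev == 0 is redirected to
 * kern_mkfifoat() instead, and most other types are rejected with EINVAL.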
*/ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mknodat_rights, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Create a named pipe. 
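 *
 * Illustrative only (the path is a placeholder):
 *
 *	mkfifoat(AT_FDCWD, "fifo0", 0600);
 *
 * The effective mode is (mode & ALLPERMS) & ~fd_cmask of the calling
 * process, as set up in kern_mkfifoat() below.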
*/ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if ((flag & ~(AT_SYMLINK_FOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, at2cnpflags(flag, AT_SYMLINK_FOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH))); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, const char *path2, enum uio_seg segflag, int follow) { struct nameidata nd; int error; do { bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag, path1, fd1, &cap_linkat_source_rights, td); if ((error = 
namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); } while (error == EAGAIN); return (error); } static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag) { struct nameidata nd; struct mount *mp; int error; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, &cap_linkat_target_rights, td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); return (EAGAIN); } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp); vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); } else { vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); return (EAGAIN); } } vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; const char *syspath; char *tmppath; struct nameidata nd; int error; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { tmppath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0) goto out; syspath = tmppath; } AUDIT_ARG_TEXT(syspath); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); 
vput(nd.ni_dvp); vn_finished_write(mp); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0, 0)); } static int kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd, int flag, enum uio_seg pathseg, ino_t oldinum) { if ((flag & ~AT_REMOVEDIR) != 0) return (EINVAL); if ((flag & AT_REMOVEDIR) != 0) return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0)); return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct funlinkat_args { int dfd; const char *path; int fd; int flag; }; #endif int sys_funlinkat(struct thread *td, struct funlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag, UIO_USERSPACE, 0)); } int kern_funlinkat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag, ino_t oldinum) { struct mount *mp; struct file *fp; struct vnode *vp; struct nameidata nd; struct stat sb; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_BENEATH | AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) { if (error == EINVAL) error = EPERM; goto fdout; } vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? 
*/ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) { goto fdout; } goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); fdout: if (fp != NULL) fdrop(fp, td); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_seek_rights, &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; int error; if ((flag & ~(AT_EACCESS | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). 
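 *
 * Descriptive note: without AT_EACCESS the check must use the real uid/gid
 * (POSIX access() semantics), so a duplicated credential with cr_uid and
 * cr_groups[0] overridden is swapped in for the duration of the lookup and
 * the vn_access() call.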
*/ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_BENEATH | AT_RESOLVE_BENEATH), pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. * XXX: many values are blindly truncated. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_size = MIN(st->st_size, INT32_MAX); ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of device, file or inode number, or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; if (ost->st_dev != st->st_dev) { switch (ino64_trunc_error) { default: /* * Since dev_t is almost raw, don't clamp to the * maximum for case 2, but ignore the error. 
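 *
 * Descriptive note: vfs.ino64_trunc_error selects the policy used throughout
 * this function: 0 truncates silently, 1 returns EOVERFLOW, 2 clamps
 * st_ino/st_nlink to their maximum representable value (see below); device
 * numbers are never clamped, as explained above.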
*/ break; case 1: return (EOVERFLOW); } } ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (ost->st_rdev != st->st_rdev) { switch (ino64_trunc_error) { default: break; case 1: return (EOVERFLOW); } } ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_BENEATH | AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) return (error); error = VOP_STAT(nd.ni_vp, 
sbp, td->td_ucred, NOCRED, td); if (error == 0) { if (__predict_false(hook != NULL)) hook(nd.ni_vp, sbp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { bzero(nsb, sizeof(*nsb)); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atim = sb->st_atim; nsb->st_mtim = sb->st_mtim; nsb->st_ctim = sb->st_ctim; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtim = sb->st_birthtim; } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. 
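 *
 * Illustrative only (the path is a placeholder):
 *
 *	char buf[PATH_MAX];
 *	ssize_t n = readlinkat(AT_FDCWD, "link", buf, sizeof(buf));
 *
 * The returned length is however much VOP_READLINK() produced; no NUL
 * terminator is appended, matching documented readlink(2) behaviour.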
*/ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; error = kern_readlink_vp(vp, buf, bufseg, count, td); vput(vp); return (error); } /* * Helper function to readlink from a vnode */ static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td) { struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) return (error); #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flags, uap->atflag)); } /* * Same as chflags() but doesn't follow symlinks. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; int error; AUDIT_ARG_FFLAGS(flags); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchflags_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, &cap_fchflags_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchmod_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. 
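 *
 * Descriptive note: unlike the path-based variants above, this dispatches
 * through fo_chmod(), so descriptor types other than vnodes can supply
 * their own handler.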
*/ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, &cap_fchmod_rights, &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error; AUDIT_ARG_OWNER(uid, gid); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchown_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, &cap_fchown_rights, &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
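 *
 * Descriptive note: a NULL times pointer means "set both timestamps to now"
 * via vfs_timestamp(); setutimes() then passes VA_UTIMES_NULL down so the
 * filesystem can apply the more permissive permission check for that case.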
*/ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error, setbirthtime; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = 0; if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
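 *
 * Descriptive note: for the futimens()/utimensat() variants below,
 * getutimens() maps UTIME_NOW to the current time and UTIME_OMIT to VNOVAL;
 * if both fields are UTIME_OMIT the timestamps are left untouched
 * (UTIMENS_EXIT).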
*/ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; int error, flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH) | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. 
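 *
 * Descriptive note: a write range lock over the whole file serializes the
 * size change against concurrent I/O, and the truncation itself is a
 * VOP_SETATTR() of va_size after the usual write-access and MAC checks.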
*/ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct vattr vattr; struct nameidata nd; int error; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } VOP_UNLOCK(vp); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; int error, lock_flags; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fsync_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. 
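 *
 * Illustrative only (names and dirfd are placeholders):
 *
 *	renameat(AT_FDCWD, "old", dirfd, "new");
 *
 * The checks below return ENOTDIR/EISDIR when exactly one side is a
 * directory, and the EISDIR from rename("dir1", "dir2/.") is translated to
 * EINVAL.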
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } #ifdef MAC static int kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg, struct nameidata *fromnd) { int error; NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(fromnd)) != 0) return (error); error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp, fromnd->ni_vp, &fromnd->ni_cnd); VOP_UNLOCK(fromnd->ni_dvp); if (fromnd->ni_dvp != fromnd->ni_vp) VOP_UNLOCK(fromnd->ni_vp); if (error != 0) { NDFREE(fromnd, NDF_ONLY_PNBUF); vrele(fromnd->ni_dvp); vrele(fromnd->ni_vp); if (fromnd->ni_startdir) vrele(fromnd->ni_startdir); } return (error); } #endif int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; again: bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg, &fromnd); if (error != 0) return (error); } else { #endif NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC } #endif fvp = fromnd.ni_vp; NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2, pathseg, new, newfd, &cap_renameat_target_rights, td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd', when newfd was used for the lookup. */ error = cap_check(&tond.ni_filecaps.fc_rights, &cap_unlinkat_rights); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. 
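 *
 * Descriptive note: the error = -1 assignment below is an internal sentinel
 * meaning "nothing to do"; it skips VOP_RENAME() and is converted back to 0
 * before returning.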
*/ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | - NOCACHE, segflg, path, fd, &cap_mkdirat_rights, - td); + NC_NOMAKEENTRY | NC_KEEPPOSENTRY, segflg, path, fd, + &cap_mkdirat_rights, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error == 0) vput(nd.ni_vp); vn_finished_write(mp); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0)); } int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag) { struct mount *mp; struct vnode *vp; struct file *fp; struct nameidata nd; cap_rights_t rights; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), &fp); if (error != 0) return (error); } restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_BENEATH | AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) goto fdout; vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. 
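 *
 * Descriptive note: when the final component resolves to the directory
 * itself (nd.ni_dvp == vp), the request is rejected with EINVAL before any
 * filesystem code runs.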
*/ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto fdout; goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); fdout: if (fp != NULL) fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. 
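 */

/*
 * Illustration of the record-length rounding used by the
 * freebsd11_kern_getdirentries() conversion loop above: the name plus its
 * terminating NUL is padded up to a 4-byte boundary before being added to
 * the fixed part of the old dirent layout.  Standalone userland program,
 * provided only to make the arithmetic concrete.
 */
#include <stdio.h>

static unsigned int
namepad(unsigned int namlen)
{

	return ((namlen + 1 + 3) & ~3u);	/* name + NUL, rounded to 4 */
}

int
main(void)
{
	unsigned int n;

	for (n = 1; n <= 8; n++)
		printf("d_namlen %u -> %u padded name bytes\n", n, namepad(n));
	/* prints 4 for lengths 1-3, 8 for 4-7, 12 for 8 */
	return (0);
}

/*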
*/ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. 
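 */

/*
 * Illustration only: a userland view of the interface implemented by
 * kern_getdirentries() above, reading the current directory with
 * getdirentries(2).  The buffer is heap-allocated so the struct dirent
 * records are suitably aligned.
 */
#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	char *buf, *cp;
	struct dirent *dp;
	ssize_t nbytes;
	off_t base;
	int fd;

	if ((buf = malloc(4096)) == NULL)
		return (1);
	if ((fd = open(".", O_RDONLY | O_DIRECTORY)) == -1)
		return (1);
	while ((nbytes = getdirentries(fd, buf, 4096, &base)) > 0) {
		for (cp = buf; cp < buf + nbytes; cp += dp->d_reclen) {
			dp = (struct dirent *)(void *)cp;
			if (dp->d_reclen == 0)
				break;
			printf("%s\n", dp->d_name);
		}
	}
	close(fd);
	free(buf);
	return (0);
}

/*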
*/ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_XUNLOCK(fdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (devfs_usecount(vp) > 0) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry and check that, if it * is a capability, the correct rights are present. A reference on the file * entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp)); } /* * syscall for the rpc.lockd to use to translate an open descriptor into * a NFS file handle. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. 
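 */

/*
 * Sketch of a hypothetical helper (illustration, mirroring the pattern of
 * kern_frmdirat() above) showing how getvnode() is usually consumed: the
 * fd is translated to a struct file with the required capability rights,
 * its vnode is used under the vnode lock, and the file reference is
 * dropped when done.
 */
static int
example_fd_to_vnode(struct thread *td, int fd)
{
	cap_rights_t rights;
	struct file *fp;
	struct vnode *vp;
	int error;

	error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP),
	    &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;		/* non-NULL, guaranteed by getvnode() */
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/* ... inspect or operate on vp ... */
	VOP_UNLOCK(vp);
	fdrop(fp, td);
	return (0);
}

/*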
*/ #ifndef _SYS_SYSPROTO_H_ struct getfhat_args { int fd; char *path; fhandle_t *fhp; int flags; }; #endif int sys_getfhat(struct thread *td, struct getfhat_args *uap) { if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, uap->fhp)); } static int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW | AT_BENEATH | AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) error = copyout(&fh, fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhlink_args { fhandle_t *fhp; const char *to; }; #endif int sys_fhlink(struct thread *td, struct fhlink_args *uap) { return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct fhlinkat_args { fhandle_t *fhp; int tofd; const char *to; }; #endif int sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) { return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); } static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); error = copyin(fhp, &fh, sizeof(fh)); if (error != 0) return (error); do { bwillwrite(); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); VOP_UNLOCK(vp); } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhreadlink_args { fhandle_t *fhp; char *buf; size_t bufsize; }; #endif int sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); if (uap->bufsize > IOSIZE_MAX) return (EINVAL); error = copyin(uap->fhp, &fh, sizeof(fh)); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); vput(vp); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? 
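 */

/*
 * Illustration only (privileged; hypothetical usage, not part of this
 * change): sys_fhopen() being implemented here rejects O_CREAT and
 * requires at least one of FREAD/FWRITE.  The userland sketch below
 * obtains a handle via getfh(2), which lands in kern_getfhat() above
 * (PRIV_VFS_GETFH), and reopens it with fhopen(2) (PRIV_VFS_FHOPEN).
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	fhandle_t fh;
	struct stat sb;
	int fd;

	if (argc != 2)
		return (1);
	if (getfh(argv[1], &fh) == -1) {		/* PRIV_VFS_GETFH */
		perror("getfh");
		return (1);
	}
	if ((fd = fhopen(&fh, O_RDONLY)) == -1) {	/* PRIV_VFS_FHOPEN */
		perror("fhopen");
		return (1);
	}
	if (fstat(fd, &sb) == 0)
		printf("%s: %jd bytes\n", argv[1], (intmax_t)sb.st_size);
	close(fd);
	return (0);
}

/*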
*/ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; finit_vnode(fp, fmode, NULL, &vnops); VOP_UNLOCK(vp); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); out: vfs_unbusy(mp); return (error); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. 
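 */

/*
 * Illustration of the region bookkeeping in kern_posix_fadvise() below:
 * an existing non-standard region is extended only when the new request
 * overlaps it or is directly adjacent to it, taking care not to step past
 * OFF_MAX.  Standalone userland program; the OFF_MAX fallback is only
 * there so it builds outside the kernel.
 */
#include <sys/types.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#ifndef OFF_MAX
#define	OFF_MAX	INT64_MAX
#endif

static bool
fadvise_mergeable(off_t fa_start, off_t fa_end, off_t offset, off_t end)
{

	return ((fa_start <= end && fa_end >= offset) ||
	    (end != OFF_MAX && fa_start == end + 1) ||
	    (fa_end != OFF_MAX && fa_end + 1 == offset));
}

int
main(void)
{

	printf("%d\n", fadvise_mergeable(0, 99, 100, 199));	/* adjacent: 1 */
	printf("%d\n", fadvise_mergeable(0, 99, 50, 149));	/* overlap: 1 */
	printf("%d\n", fadvise_mergeable(0, 99, 200, 299));	/* disjoint: 0 */
	return (0);
}

/*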
*/ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, off_t *outoffp, size_t len, unsigned int flags) { struct file *infp, *outfp; struct vnode *invp, *outvp; int error; size_t retlen; void *rl_rcookie, *rl_wcookie; off_t savinoff, savoutoff; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; savinoff = -1; error = 0; retlen = 0; if (flags != 0) { error = EINVAL; goto out; } if (len > SSIZE_MAX) /* * Although the len argument is size_t, the return argument * is ssize_t (which is signed). Therefore a size that won't * fit in ssize_t can't be returned. */ len = SSIZE_MAX; /* Get the file structures for the file descriptors. 
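 */

/*
 * Illustration only (hypothetical file names): a userland view of the
 * copy_file_range(2) interface that kern_copy_file_range() here backs.
 * Passing NULL offset pointers uses and advances each descriptor's own
 * file offset; the request length is clamped to SSIZE_MAX, matching the
 * clamp applied just above.
 */
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	ssize_t ncopied;
	int infd, outfd;

	if (argc != 3)
		return (1);
	infd = open(argv[1], O_RDONLY);
	outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (infd == -1 || outfd == -1) {
		perror("open");
		return (1);
	}
	/* NULL offset pointers: use and advance the descriptors' offsets. */
	do {
		ncopied = copy_file_range(infd, NULL, outfd, NULL,
		    SSIZE_MAX, 0);
	} while (ncopied > 0);
	if (ncopied == -1)
		perror("copy_file_range");
	close(outfd);
	close(infd);
	return (ncopied == -1);
}

/*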
*/ error = fget_read(td, infd, &cap_read_rights, &infp); if (error != 0) goto out; if (infp->f_ops == &badfileops) { error = EBADF; goto out; } if (infp->f_vnode == NULL) { error = EINVAL; goto out; } error = fget_write(td, outfd, &cap_write_rights, &outfp); if (error != 0) goto out; if (outfp->f_ops == &badfileops) { error = EBADF; goto out; } if (outfp->f_vnode == NULL) { error = EINVAL; goto out; } /* Set the offset pointers to the correct place. */ if (inoffp == NULL) inoffp = &infp->f_offset; if (outoffp == NULL) outoffp = &outfp->f_offset; savinoff = *inoffp; savoutoff = *outoffp; invp = infp->f_vnode; outvp = outfp->f_vnode; /* Sanity check the f_flag bits. */ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || (infp->f_flag & FREAD) == 0) { error = EBADF; goto out; } /* If len == 0, just return 0. */ if (len == 0) goto out; /* * If infp and outfp refer to the same file, the byte ranges cannot * overlap. */ if (invp == outvp && ((savinoff <= savoutoff && savinoff + len > savoutoff) || (savinoff > savoutoff && savoutoff + len > savinoff))) { error = EINVAL; goto out; } /* Range lock the byte ranges for both invp and outvp. */ for (;;) { rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + len); rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + len); if (rl_rcookie != NULL) break; vn_rangelock_unlock(outvp, rl_wcookie); rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); vn_rangelock_unlock(invp, rl_rcookie); } retlen = len; error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, flags, infp->f_cred, outfp->f_cred, td); out: if (rl_rcookie != NULL) vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); if (savinoff != -1 && (error == EINTR || error == ERESTART)) { *inoffp = savinoff; *outoffp = savoutoff; } if (outfp != NULL) fdrop(outfp, td); if (infp != NULL) fdrop(infp, td); td->td_retval[0] = retlen; return (error); } int sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) { off_t inoff, outoff, *inoffp, *outoffp; int error; inoffp = outoffp = NULL; if (uap->inoffp != NULL) { error = copyin(uap->inoffp, &inoff, sizeof(off_t)); if (error != 0) return (error); inoffp = &inoff; } if (uap->outoffp != NULL) { error = copyin(uap->outoffp, &outoff, sizeof(off_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, outoffp, uap->len, uap->flags); if (error == 0 && uap->inoffp != NULL) error = copyout(inoffp, uap->inoffp, sizeof(off_t)); if (error == 0 && uap->outoffp != NULL) error = copyout(outoffp, uap->outoffp, sizeof(off_t)); return (error); } Index: head/sys/kern/vnode_if.src =================================================================== --- head/sys/kern/vnode_if.src (revision 366949) +++ head/sys/kern/vnode_if.src (revision 366950) @@ -1,807 +1,808 @@ #- # Copyright (c) 1992, 1993 # The Regents of the University of California. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. 
Neither the name of the University nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 # $FreeBSD$ # # # Above each of the vop descriptors in lines starting with %% # is a specification of the locking protocol used by each vop call. # The first column is the name of the variable, the remaining three # columns are in, out and error respectively. The "in" column defines # the lock state on input, the "out" column defines the state on successful # return, and the "error" column defines the locking state on error exit. # # The locking value can take the following values: # L: locked; not converted to type of lock. # E: locked with exclusive lock for this process. # U: unlocked. # -: not applicable. vnode does not yet (or no longer) exists. # =: the same on input and output, may be either L or U. # # The paramater named "vpp" is assumed to be always used with double # indirection (**vpp) and that name is hard-coded in vnode_if.awk ! # # Lines starting with %! specify a pre or post-condition function # to call before/after the vop call. # # If other such parameters are introduced, they have to be added to # the AWK script at the head of the definition of "add_debug_code()". # vop_islocked { IN struct vnode *vp; }; %% lookup dvp L L L %% lookup vpp - L - # XXX - the lookup locking protocol defies simple description and depends # on the flags and operation fields in the (cnp) structure. Note # especially that *vpp may equal dvp and both may be locked. vop_lookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% cachedlookup dvp L L L %% cachedlookup vpp - L - # This must be an exact copy of lookup. See kern/vfs_cache.c for details. vop_cachedlookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% create dvp E E E %% create vpp - L - %! create pre vop_create_pre %! create post vop_create_post vop_create { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% whiteout dvp E E E %! whiteout pre vop_whiteout_pre %! whiteout post vop_whiteout_post vop_whiteout { IN struct vnode *dvp; IN struct componentname *cnp; IN int flags; }; %% mknod dvp E E E %% mknod vpp - L - %! mknod pre vop_mknod_pre %! mknod post vop_mknod_post vop_mknod { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% open vp L L L %! open post vop_open_post vop_open { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct thread *td; IN struct file *fp; }; %% close vp L L L %! 
close post vop_close_post vop_close { IN struct vnode *vp; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% fplookup_vexec vp - - - %! fplookup_vexec debugpre vop_fplookup_vexec_debugpre %! fplookup_vexec debugpost vop_fplookup_vexec_debugpost vop_fplookup_vexec { IN struct vnode *vp; IN struct ucred *cred; }; %% access vp L L L vop_access { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% accessx vp L L L vop_accessx { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% stat vp L L L vop_stat { IN struct vnode *vp; OUT struct stat *sb; IN struct ucred *active_cred; IN struct ucred *file_cred; IN struct thread *td; }; %% getattr vp L L L vop_getattr { IN struct vnode *vp; OUT struct vattr *vap; IN struct ucred *cred; }; %% setattr vp E E E %! setattr pre vop_setattr_pre %! setattr post vop_setattr_post vop_setattr { IN struct vnode *vp; IN struct vattr *vap; IN struct ucred *cred; }; %% mmapped vp L L L vop_mmapped { IN struct vnode *vp; }; %% read vp L L L %! read post vop_read_post vop_read { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% read_pgcache vp - - - %! read_pgcache post vop_read_pgcache_post vop_read_pgcache { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% write vp L L L %! write pre VOP_WRITE_PRE %! write post VOP_WRITE_POST vop_write { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% ioctl vp U U U vop_ioctl { IN struct vnode *vp; IN u_long command; IN void *data; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% poll vp U U U vop_poll { IN struct vnode *vp; IN int events; IN struct ucred *cred; IN struct thread *td; }; %% kqfilter vp U U U vop_kqfilter { IN struct vnode *vp; IN struct knote *kn; }; %% revoke vp L L L vop_revoke { IN struct vnode *vp; IN int flags; }; %% fsync vp L L L vop_fsync { IN struct vnode *vp; IN int waitfor; IN struct thread *td; }; %% remove dvp E E E %% remove vp E E E %! remove pre vop_remove_pre %! remove post vop_remove_post vop_remove { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% link tdvp E E E %% link vp E E E %! link pre vop_link_pre %! link post vop_link_post vop_link { IN struct vnode *tdvp; IN struct vnode *vp; IN struct componentname *cnp; }; %! rename pre vop_rename_pre %! rename post vop_rename_post vop_rename { IN WILLRELE struct vnode *fdvp; IN WILLRELE struct vnode *fvp; IN struct componentname *fcnp; IN WILLRELE struct vnode *tdvp; IN WILLRELE struct vnode *tvp; IN struct componentname *tcnp; }; %% mkdir dvp E E E %% mkdir vpp - E - %! mkdir pre vop_mkdir_pre %! mkdir post vop_mkdir_post +%! mkdir debugpost vop_mkdir_debugpost vop_mkdir { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% rmdir dvp E E E %% rmdir vp E E E %! rmdir pre vop_rmdir_pre %! rmdir post vop_rmdir_post vop_rmdir { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% symlink dvp E E E %% symlink vpp - E - %! symlink pre vop_symlink_pre %! symlink post vop_symlink_post vop_symlink { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; IN const char *target; }; %% readdir vp L L L %! 
readdir post vop_readdir_post vop_readdir { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; INOUT int *eofflag; OUT int *ncookies; INOUT u_long **cookies; }; %% readlink vp L L L vop_readlink { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; }; %% inactive vp E E E vop_inactive { IN struct vnode *vp; }; %! need_inactive debugpre vop_need_inactive_debugpre %! need_inactive debugpost vop_need_inactive_debugpost vop_need_inactive { IN struct vnode *vp; }; %% reclaim vp E E E %! reclaim post vop_reclaim_post vop_reclaim { IN struct vnode *vp; }; %! lock1 debugpre vop_lock_debugpre %! lock1 debugpost vop_lock_debugpost vop_lock1 { IN struct vnode *vp; IN int flags; IN const char *file; IN int line; }; %! unlock debugpre vop_unlock_debugpre vop_unlock { IN struct vnode *vp; }; %% bmap vp L L L vop_bmap { IN struct vnode *vp; IN daddr_t bn; OUT struct bufobj **bop; IN daddr_t *bnp; OUT int *runp; OUT int *runb; }; %% strategy vp L L L %! strategy debugpre vop_strategy_debugpre vop_strategy { IN struct vnode *vp; IN struct buf *bp; }; %% getwritemount vp = = = vop_getwritemount { IN struct vnode *vp; OUT struct mount **mpp; }; %% print vp - - - vop_print { IN struct vnode *vp; }; %% pathconf vp L L L vop_pathconf { IN struct vnode *vp; IN int name; OUT long *retval; }; %% advlock vp U U U vop_advlock { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; }; %% advlockasync vp U U U vop_advlockasync { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; IN struct task *task; INOUT void **cookiep; }; %% advlockpurge vp E E E vop_advlockpurge { IN struct vnode *vp; }; %% reallocblks vp E E E vop_reallocblks { IN struct vnode *vp; IN struct cluster_save *buflist; }; %% getpages vp L L L vop_getpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; }; %% getpages_async vp L L L vop_getpages_async { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; IN vop_getpages_iodone_t *iodone; IN void *arg; }; %% putpages vp L L L vop_putpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int sync; IN int *rtvals; }; %% getacl vp L L L vop_getacl { IN struct vnode *vp; IN acl_type_t type; OUT struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% setacl vp E E E %! setacl pre vop_setacl_pre %! setacl post vop_setacl_post vop_setacl { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% aclcheck vp = = = vop_aclcheck { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% closeextattr vp L L L vop_closeextattr { IN struct vnode *vp; IN int commit; IN struct ucred *cred; IN struct thread *td; }; %% getextattr vp L L L vop_getextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% listextattr vp L L L vop_listextattr { IN struct vnode *vp; IN int attrnamespace; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% openextattr vp L L L vop_openextattr { IN struct vnode *vp; IN struct ucred *cred; IN struct thread *td; }; %% deleteextattr vp E E E %! deleteextattr pre vop_deleteextattr_pre %! 
deleteextattr post vop_deleteextattr_post vop_deleteextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; IN struct ucred *cred; IN struct thread *td; }; %% setextattr vp E E E %! setextattr pre vop_setextattr_pre %! setextattr post vop_setextattr_post vop_setextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct thread *td; }; %% setlabel vp E E E vop_setlabel { IN struct vnode *vp; IN struct label *label; IN struct ucred *cred; IN struct thread *td; }; %% vptofh vp = = = vop_vptofh { IN struct vnode *vp; IN struct fid *fhp; }; %% vptocnp vp L L L %% vptocnp vpp - U - vop_vptocnp { IN struct vnode *vp; OUT struct vnode **vpp; INOUT char *buf; INOUT size_t *buflen; }; %% allocate vp E E E vop_allocate { IN struct vnode *vp; INOUT off_t *offset; INOUT off_t *len; }; %% advise vp U U U vop_advise { IN struct vnode *vp; IN off_t start; IN off_t end; IN int advice; }; %% unp_bind vp E E E vop_unp_bind { IN struct vnode *vp; IN struct unpcb *unpcb; }; %% unp_connect vp L L L vop_unp_connect { IN struct vnode *vp; OUT struct unpcb **unpcb; }; %% unp_detach vp = = = vop_unp_detach { IN struct vnode *vp; }; %% is_text vp L L L vop_is_text { IN struct vnode *vp; }; %% set_text vp = = = vop_set_text { IN struct vnode *vp; }; %% vop_unset_text vp L L L vop_unset_text { IN struct vnode *vp; }; %% add_writecount vp L L L vop_add_writecount { IN struct vnode *vp; IN int inc; }; %% fdatasync vp L L L vop_fdatasync { IN struct vnode *vp; IN struct thread *td; }; %% copy_file_range invp U U U %% copy_file_range outvp U U U vop_copy_file_range { IN struct vnode *invp; INOUT off_t *inoffp; IN struct vnode *outvp; INOUT off_t *outoffp; INOUT size_t *lenp; IN unsigned int flags; IN struct ucred *incred; IN struct ucred *outcred; IN struct thread *fsizetd; }; # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, # the new VOP should replace one of the spares. vop_spare1 { IN struct vnode *vp; }; vop_spare2 { IN struct vnode *vp; }; vop_spare3 { IN struct vnode *vp; }; vop_spare4 { IN struct vnode *vp; }; vop_spare5 { IN struct vnode *vp; }; Index: head/sys/sys/namei.h =================================================================== --- head/sys/sys/namei.h (revision 366949) +++ head/sys/sys/namei.h (revision 366950) @@ -1,285 +1,288 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)namei.h 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_NAMEI_H_ #define _SYS_NAMEI_H_ #include #include #include #include enum nameiop { LOOKUP, CREATE, DELETE, RENAME }; struct componentname { /* * Arguments to lookup. */ u_int64_t cn_origflags; /* flags to namei */ u_int64_t cn_flags; /* flags to namei */ struct thread *cn_thread;/* thread requesting lookup */ struct ucred *cn_cred; /* credentials */ enum nameiop cn_nameiop; /* namei operation */ int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */ /* * Shared between lookup and commit routines. */ char *cn_pnbuf; /* pathname buffer */ char *cn_nameptr; /* pointer to looked up name */ long cn_namelen; /* length of looked up component */ }; struct nameicap_tracker; TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); /* * Encapsulation of namei parameters. */ struct nameidata { /* * Arguments to namei/lookup. */ const char *ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ cap_rights_t *ni_rightsneeded; /* rights required to look up vnode */ /* * Arguments to lookup. */ struct vnode *ni_startdir; /* starting directory */ struct vnode *ni_rootdir; /* logical root directory */ struct vnode *ni_topdir; /* logical top directory */ int ni_dirfd; /* starting directory for *at functions */ int ni_lcf; /* local call flags */ /* * Results: returned from namei */ struct filecaps ni_filecaps; /* rights the *at base has */ /* * Results: returned from/manipulated by lookup */ struct vnode *ni_vp; /* vnode of result */ struct vnode *ni_dvp; /* vnode of intermediate directory */ /* * Results: flags returned from namei */ u_int ni_resflags; /* * Shared between namei and lookup/commit routines. */ size_t ni_pathlen; /* remaining chars in path */ char *ni_next; /* next location in pathname */ u_int ni_loopcnt; /* count of symlinks encountered */ /* * Lookup parameters: this structure describes the subset of * information from the nameidata structure that is passed * through the VOP interface. */ struct componentname ni_cnd; struct nameicap_tracker_head ni_cap_tracker; struct vnode *ni_beneath_latch; }; #ifdef _KERNEL enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL, CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET }; int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp); /* * Flags for namei. * * If modifying the list make sure to check whether NDVALIDATE needs updating. 
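 */

/*
 * Sketch of a hypothetical filesystem fragment (illustration only) showing
 * how VOP_LOOKUP implementations typically consume the struct
 * componentname fields declared above.  cn_nameptr points into the shared
 * pathname buffer, so comparisons should rely on cn_namelen rather than on
 * NUL termination; "entry" stands in for whatever per-filesystem directory
 * record is being scanned.
 */
	if (entry->namelen == cnp->cn_namelen &&
	    bcmp(entry->name, cnp->cn_nameptr, cnp->cn_namelen) == 0) {
		/* Found the component; act on cnp->cn_nameiop and flags. */
	}

/*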
*/ /* * namei operational modifier flags, stored in ni_cnd.flags */ +#define NC_NOMAKEENTRY 0x0001 /* name must not be added to cache */ +#define NC_KEEPPOSENTRY 0x0002 /* don't evict a positive entry */ +#define NOCACHE NC_NOMAKEENTRY /* for compatibility with older code */ #define LOCKLEAF 0x0004 /* lock vnode on return */ #define LOCKPARENT 0x0008 /* want parent vnode returned locked */ #define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ -#define NOCACHE 0x0020 /* name must not be left in cache */ +/* UNUSED 0x0020 */ #define FOLLOW 0x0040 /* follow symbolic links */ #define BENEATH 0x0080 /* No escape from the start dir */ #define LOCKSHARED 0x0100 /* Shared lock leaf */ #define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ #define RBENEATH 0x100000000ULL /* No escape, even tmp, from start dir */ -#define MODMASK 0xf000001fcULL /* mask of operational modifiers */ +#define MODMASK 0xf000001ffULL /* mask of operational modifiers */ /* * Namei parameter descriptors. * * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. * If the caller of namei sets the flag (for example execve wants to * know the name of the program that is being executed), then it must * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must * be freed by either the commit routine or the VOP_ABORT routine. * SAVESTART is set only by the callers of namei. It implies SAVENAME * plus the addition of saving the parent directory that contains the * name in ni_startdir. It allows repeated calls to lookup for the * name being sought. The caller is responsible for releasing the * buffer and for vrele'ing ni_startdir. */ #define RDONLY 0x00000200 /* lookup with read-only semantics */ #define SAVENAME 0x00000400 /* save pathname buffer */ #define SAVESTART 0x00000800 /* save starting directory */ #define ISWHITEOUT 0x00001000 /* found whiteout */ #define DOWHITEOUT 0x00002000 /* do whiteouts */ #define WILLBEDIR 0x00004000 /* new files will be dirs; allow trailing / */ #define ISOPEN 0x00008000 /* caller is opening; return a real vnode. */ #define NOCROSSMOUNT 0x00010000 /* do not cross mount points */ #define NOMACCHECK 0x00020000 /* do not perform MAC checks */ #define AUDITVNODE1 0x00040000 /* audit the looked up vnode information */ #define AUDITVNODE2 0x00080000 /* audit the looked up vnode information */ #define NOCAPCHECK 0x00100000 /* do not perform capability checks */ /* UNUSED 0x00200000 */ /* UNUSED 0x00400000 */ /* UNUSED 0x00800000 */ #define HASBUF 0x01000000 /* has allocated pathname buffer */ #define NOEXECCHECK 0x02000000 /* do not perform exec check on dir */ #define MAKEENTRY 0x04000000 /* entry is to be added to name cache */ #define ISSYMLINK 0x08000000 /* symlink needs interpretation */ #define ISLASTCN 0x10000000 /* this is last component of pathname */ #define ISDOTDOT 0x20000000 /* current component name is .. */ #define TRAILINGSLASH 0x40000000 /* path ended in a slash */ #define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */ /* * Flags which must not be passed in by callers. */ #define NAMEI_INTERNAL_FLAGS \ (HASBUF | NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \ TRAILINGSLASH) /* * Namei results flags */ #define NIRES_ABS 0x00000001 /* Path was absolute */ #define NIRES_STRICTREL 0x00000002 /* Restricted lookup result */ /* * Flags in ni_lcf, valid for the duration of the namei call. */ #define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */ #define NI_LCF_CAP_DOTDOT 0x0002 /* ".." 
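 */

/*
 * Sanity illustration (not in the tree): the MODMASK widening above, from
 * 0xf000001fcULL to 0xf000001ffULL, pulls the two new NC_* bits into the
 * set of recognized operational modifiers alongside the existing ones and
 * the reserved 0xf00000000 range that holds RBENEATH.  In a file that
 * includes sys/namei.h this assertion would hold as written.
 */
_Static_assert(((NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT |
    WANTPARENT | FOLLOW | BENEATH | LOCKSHARED | RBENEATH) & ~MODMASK) == 0,
    "all operational modifier flags must be covered by MODMASK");

/*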
in strictrelative case */ #define NI_LCF_BENEATH_ABS 0x0004 /* BENEATH with absolute path */ #define NI_LCF_BENEATH_LATCHED 0x0008 /* BENEATH_ABS traversed starting dir */ #define NI_LCF_LATCH 0x0010 /* ni_beneath_latch valid */ /* * Initialization of a nameidata structure. */ #define NDINIT(ndp, op, flags, segflg, namep, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, &cap_no_rights, td) #define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, &cap_no_rights, td) #define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td) #define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, &cap_no_rights, td) /* * Note the constant pattern may *hide* bugs. */ #ifdef INVARIANTS #define NDINIT_PREFILL(arg) memset(arg, 0xff, sizeof(*arg)) #else #define NDINIT_PREFILL(arg) do { } while (0) #endif #define NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, startdir, rightsp, td) \ do { \ struct nameidata *_ndp = (ndp); \ cap_rights_t *_rightsp = (rightsp); \ MPASS(_rightsp != NULL); \ NDINIT_PREFILL(_ndp); \ _ndp->ni_cnd.cn_nameiop = op; \ _ndp->ni_cnd.cn_flags = flags; \ _ndp->ni_segflg = segflg; \ _ndp->ni_dirp = namep; \ _ndp->ni_dirfd = dirfd; \ _ndp->ni_startdir = startdir; \ _ndp->ni_resflags = 0; \ filecaps_init(&_ndp->ni_filecaps); \ _ndp->ni_cnd.cn_thread = td; \ _ndp->ni_rightsneeded = _rightsp; \ } while (0) #define NDF_NO_DVP_RELE 0x00000001 #define NDF_NO_DVP_UNLOCK 0x00000002 #define NDF_NO_DVP_PUT 0x00000003 #define NDF_NO_VP_RELE 0x00000004 #define NDF_NO_VP_UNLOCK 0x00000008 #define NDF_NO_VP_PUT 0x0000000c #define NDF_NO_STARTDIR_RELE 0x00000010 #define NDF_NO_FREE_PNBUF 0x00000020 #define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF) void NDFREE_PNBUF(struct nameidata *); void NDFREE(struct nameidata *, const u_int); #define NDFREE(ndp, flags) do { \ struct nameidata *_ndp = (ndp); \ if (__builtin_constant_p(flags) && flags == NDF_ONLY_PNBUF) \ NDFREE_PNBUF(_ndp); \ else \ NDFREE(_ndp, flags); \ } while (0) #ifdef INVARIANTS void NDVALIDATE(struct nameidata *); #else #define NDVALIDATE(ndp) do { } while (0) #endif int namei(struct nameidata *ndp); int lookup(struct nameidata *ndp); int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); #endif /* * Stats on usefulness of namei caches. */ struct nchstats { long ncs_goodhits; /* hits that we can really use */ long ncs_neghits; /* negative hits that we can use */ long ncs_badhits; /* hits we must drop */ long ncs_falsehits; /* hits with id mismatch */ long ncs_miss; /* misses */ long ncs_long; /* long names that ignore cache */ long ncs_pass2; /* names found with passes == 2 */ long ncs_2passes; /* number of times we attempt it */ }; extern struct nchstats nchstats; #endif /* !_SYS_NAMEI_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 366949) +++ head/sys/sys/vnode.h (revision 366950) @@ -1,1083 +1,1094 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) 
*/ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ seqc_t v_seqc; /* i modification count */ uint32_t v_nchash; /* u namecache hash */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. */ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_vnodelist; /* l vnode lists */ TAILQ_ENTRY(vnode) v_lazylist; /* l vnode lazy list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_short v_iflag; /* i vnode flags (see below) */ u_short v_vflag; /* v vnode flags */ u_short v_mflag; /* l mnt-specific vnode flags */ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ int v_seqc_users; /* i modifications pending */ u_int v_hash; }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. 
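 */

/*
 * Sketch of a hypothetical fragment (illustration only) following the
 * list-handling protocol described in the comment above struct vnode:
 * take the vnode interlock before dropping the list lock, then vget()
 * with LK_INTERLOCK so the vnode cannot be recycled underneath us.
 * "list_lock" and "some_vnode_list" are stand-ins, step 5 (checking for
 * a doomed vnode) applies only when the vnode lock is not taken, and
 * details such as the exact vget() signature vary between branches.
 */
	mtx_lock(&list_lock);			/* 1) lock the list, find vp */
	vp = TAILQ_FIRST(&some_vnode_list);
	if (vp != NULL) {
		VI_LOCK(vp);			/* 2) vnode cannot go away now */
		mtx_unlock(&list_lock);		/* 3) avoid lock order reversal */
		/* 4) vget with LK_INTERLOCK; failure means the vnode is gone */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
			/* 6) perform the operation */
			vput(vp);
		}
	} else
		mtx_unlock(&list_lock);

/*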
*/ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VIRF_PGREAD 0x0002 /* Direct reads from the page cache are permitted, never cleared once set */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0002 /* Mount in progress */ #define VI_DOINGINACT 0x0004 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x0008 /* Need to call inactive */ #define VI_DEFINACT 0x0010 /* deferred inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). 
*/ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. 
*/ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; extern const u_int io_hold_cnt; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. 
"/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern u_long desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ int vdesc_vop_offset; vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. 
*/ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #define ASSERT_VOP_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_IN_SEQC(vp) ((void)0) #define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 #define VN_OPEN_INVFS 0x00000008 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. 
*/ void cache_changesize(u_long newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_vnode_init(struct vnode *vp); void cache_purge(struct vnode *vp); void cache_purge_vgone(struct vnode *vp); void cache_purge_negative(struct vnode *vp); void cache_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp); void cache_purgevfs(struct mount *mp); +#ifdef INVARIANTS +void cache_validate(struct vnode *dvp, struct vnode *vp, + struct componentname *cnp); +#else +static inline void +cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) +{ +} +#endif int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(void); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen); int vn_getcwd(char *buf, char **retbuf, size_t *buflen); int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf); int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred); int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred); void vattr_null(struct vattr *vap); void vlazy(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int flags); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vget_finish_ref(struct vnode *vp, enum vgetstate vs); void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdnz(struct vnode *); bool vhold_smr(struct vnode *); void vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) 
__printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_need_pageq_flush(struct vnode *vp); bool vn_isdisk_error(struct vnode *vp, int *errp); bool vn_isdisk(struct vnode *vp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_read_from_obj(struct vnode *vp, struct uio *uio); int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); void vn_seqc_write_begin_unheld_locked(struct vnode *vp); void vn_seqc_write_begin_unheld(struct vnode *vp); void vn_seqc_write_begin_locked(struct vnode *vp); void vn_seqc_write_begin(struct vnode *vp); void vn_seqc_write_end_locked(struct vnode *vp); void vn_seqc_write_end(struct vnode *vp); #define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) #define vn_seqc_consistent(vp, seq) 
seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_tryrlock(vp, start, end) \ rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_trywlock(vp, start, end) \ rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_lock(struct vop_lock1_args *); int vop_unlock(struct vop_unlock_args *); int vop_islocked(struct vop_islocked_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_eagain(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. 
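 */

/*
 * Illustrative sketch of roughly what the wrappers generated from
 * vnode_if.src do with the pre/post hooks declared below, using VOP_MKDIR
 * as the example.  This is heavily simplified: the real generated code also
 * handles vop_bypass, walks the vop_vector chain for a non-NULL
 * implementation, and wires up the DEBUG_VFS_LOCKS hooks such as
 * vop_mkdir_debugpost().  example_VOP_MKDIR() is not the real wrapper.
 */
static int
example_VOP_MKDIR(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct vattr *vap)
{
	struct vop_mkdir_args a;
	int rc;

	a.a_gen.a_desc = &vop_mkdir_desc;
	a.a_dvp = dvp;
	a.a_vpp = vpp;
	a.a_cnp = cnp;
	a.a_vap = vap;
	vop_mkdir_pre(&a);		/* seqc/assertion bookkeeping */
	rc = dvp->v_op->vop_mkdir(&a);	/* the filesystem's method */
	vop_mkdir_post(&a, rc);		/* knote and cache side effects */
	vop_mkdir_debugpost(&a, rc);	/* extra DEBUG_VFS_LOCKS checks */
	return (rc);
}

/*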
*/ void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); void vop_deleteextattr_post(void *a, int rc); void vop_link_pre(void *a); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_pre(void *a); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_read_pgcache_post(void *ap, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_pre(void *a); void vop_setattr_post(void *a, int rc); void vop_setacl_pre(void *a); void vop_setacl_post(void *a, int rc); void vop_setextattr_pre(void *a); void vop_setextattr_post(void *a, int rc); void vop_symlink_pre(void *a); void vop_symlink_post(void *a, int rc); int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *a); void vop_fplookup_vexec_debugpost(void *a, int rc); void vop_strategy_debugpre(void *a); void vop_lock_debugpre(void *a); void vop_lock_debugpost(void *a, int rc); void vop_unlock_debugpre(void *a); void vop_need_inactive_debugpre(void *a); void vop_need_inactive_debugpost(void *a, int rc); +void vop_mkdir_debugpost(void *a, int rc); #else #define vop_fplookup_vexec_debugpre(x) do { } while (0) #define vop_fplookup_vexec_debugpost(x, y) do { } while (0) #define vop_strategy_debugpre(x) do { } while (0) #define vop_lock_debugpre(x) do { } while (0) #define vop_lock_debugpost(x, y) do { } while (0) #define vop_unlock_debugpre(x) do { } while (0) #define vop_need_inactive_debugpre(x) do { } while (0) #define vop_need_inactive_debugpost(x, y) do { } while (0) +#define vop_mkdir_debugpost(x, y) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define vop_stat_helper_pre(ap) ({ \ int _error; \ AUDIT_ARG_VNODE1(ap->a_vp); \ _error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\ if (__predict_true(_error == 0)) \ bzero(ap->a_sb, sizeof(*ap->a_sb)); \ _error; \ }) #define vop_stat_helper_post(ap, error) ({ \ int _error = (error); \ if (priv_check_cred_vfs_generation(ap->a_td->td_ucred)) \ ap->a_sb->st_gen = 0; \ _error; \ }) #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? 
NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) #ifdef INVARIANTS #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ do { \ int error_; \ \ error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d", \ error_)); \ } while (0) #define VOP_SET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_SET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d", \ error_)); \ } while (0) #define VOP_UNSET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_UNSET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d", \ error_)); \ } while (0) #else #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) #define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) #define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) #endif #define VN_IS_DOOMED(vp) __predict_false((vp)->v_irflag & VIRF_DOOMED) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefact(struct vnode *vp); void v_addpollinfo(struct vnode *vp); static __inline int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } #define vholdl(vp) do { \ ASSERT_VI_LOCKED(vp, __func__); \ vhold(vp); \ } while (0) #define vrefl(vp) do { \ ASSERT_VI_LOCKED(vp, __func__); \ vref(vp); \ } while (0) int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) #define VOP_EAGAIN ((void*)(uintptr_t)vop_eagain) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(u_long newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); void vn_fsid(struct vnode *vp, struct vattr *va); int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); #define VOP_UNLOCK_FLAGS(vp, flags) ({ \ struct vnode *_vp = (vp); \ int _flags = (flags); \ int _error; \ \ if ((_flags & ~(LK_INTERLOCK | 
LK_RELEASE)) != 0) \ panic("%s: unsupported flags %x\n", __func__, flags); \ _error = VOP_UNLOCK(_vp); \ if (_flags & LK_INTERLOCK) \ VI_UNLOCK(_vp); \ _error; \ }) #include #define VFS_VOP_VECTOR_REGISTER(vnodeops) \ SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \ vfs_vector_op_register, &vnodeops) #define VFS_SMR_DECLARE \ extern smr_t vfs_smr #define VFS_SMR() vfs_smr #define vfs_smr_enter() smr_enter(VFS_SMR()) #define vfs_smr_exit() smr_exit(VFS_SMR()) #define vfs_smr_entered_load(ptr) smr_entered_load((ptr), VFS_SMR()) #define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) #define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) #define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) #define vn_load_v_data_smr(vp) ({ \ struct vnode *_vp = (vp); \ \ VFS_SMR_ASSERT_ENTERED(); \ atomic_load_ptr(&(_vp)->v_data); \ }) #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */
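
/*
 * Illustrative sketch, appended for reference: the common pattern for taking
 * a reference on a vnode found while holding its interlock, then locking it
 * and checking for a concurrent reclaim with VN_IS_DOOMED().
 * example_use_vnode() is hypothetical; vget_prep(), vget_finish() and vput()
 * are the interfaces declared in this header.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/vnode.h>

static int
example_use_vnode(struct vnode *vp)
{
	enum vgetstate vs;
	int error;

	ASSERT_VI_LOCKED(vp, __func__);
	vs = vget_prep(vp);		/* take a hold while the interlock pins vp */
	VI_UNLOCK(vp);
	error = vget_finish(vp, LK_EXCLUSIVE, vs);
	if (error != 0)
		return (error);
	if (VN_IS_DOOMED(vp)) {
		/* The vnode was reclaimed while we slept on its lock. */
		vput(vp);
		return (ENOENT);
	}
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);
	return (0);
}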
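
/*
 * Illustrative sketch, appended for reference: the intended use of the
 * vfs_smr accessors defined above on a lockless fast path.  The reader
 * enters the SMR section, snapshots the vnode sequence counter, loads
 * v_data with vn_load_v_data_smr(), and only trusts what it read if the
 * counter is still consistent afterwards.  struct example_inode, EI_ISDIR
 * and example_isdir_smr() are hypothetical stand-ins for a filesystem's
 * private data.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/seqc.h>
#include <sys/smr.h>
#include <sys/vnode.h>

VFS_SMR_DECLARE;

struct example_inode {
	int	ei_flags;		/* hypothetical private flags */
#define	EI_ISDIR	0x0001
};

static bool
example_isdir_smr(struct vnode *vp)
{
	struct example_inode *ip;
	seqc_t seq;
	bool isdir;

	vfs_smr_enter();
	seq = vn_seqc_read_any(vp);
	if (seqc_in_modify(seq)) {
		/* The vnode is being modified; fall back to the locked path. */
		vfs_smr_exit();
		return (false);
	}
	ip = vn_load_v_data_smr(vp);
	isdir = ip != NULL && (ip->ei_flags & EI_ISDIR) != 0;
	if (!vn_seqc_consistent(vp, seq))
		isdir = false;		/* raced with a modification; distrust it */
	vfs_smr_exit();
	return (isdir);
}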