diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c index fea34273baef..61a59be9f78b 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c @@ -1,1845 +1,1849 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2011 Martin Matuska */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" /* Used by fstat(1). */ +#ifdef SYSCTL_SIZEOF +SYSCTL_SIZEOF(znode, znode_t); +#else SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); +#endif /* * Define ZNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef ZFS_DEBUG #define ZNODE_STATS #endif /* DEBUG */ #ifdef ZNODE_STATS #define ZNODE_STAT_ADD(stat) ((stat)++) #else #define ZNODE_STAT_ADD(stat) /* nothing */ #endif /* ZNODE_STATS */ #if !defined(KMEM_DEBUG) #define _ZFS_USE_SMR static uma_zone_t znode_uma_zone; #else static kmem_cache_t *znode_cache = NULL; #endif extern struct vop_vector zfs_vnodeops; extern struct vop_vector zfs_fifoops; extern struct vop_vector zfs_shareops; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { znode_t *zp = buf; POINTER_INVALIDATE(&zp->z_zfsvfs); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); ASSERT3P(zp->z_vnode, ==, NULL); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } #ifdef _ZFS_USE_SMR VFS_SMR_DECLARE; static int zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private, int flags) { return (zfs_znode_cache_constructor(mem, private, flags)); } static void zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private) { zfs_znode_cache_destructor(mem, private); } void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_uma_zone, ==, NULL); znode_uma_zone = uma_zcreate("zfs_znode_cache", sizeof (znode_t), zfs_znode_cache_constructor_smr, zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0); VFS_SMR_ZONE_SET(znode_uma_zone); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (uma_zalloc_smr(znode_uma_zone, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } uma_zfree_smr(znode_uma_zone, zp); } #else void zfs_znode_init(void) { /* * Initialize zcache */ ASSERT3P(znode_cache, ==, NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE); } static znode_t * zfs_znode_alloc_kmem(int flags) { return (kmem_cache_alloc(znode_cache, flags)); } static void zfs_znode_free_kmem(znode_t *zp) { if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } #endif void zfs_znode_fini(void) { /* * Cleanup zcache */ #ifdef _ZFS_USE_SMR if (znode_uma_zone) { uma_zdestroy(znode_uma_zone); znode_uma_zone = NULL; } #else if (znode_cache) { kmem_cache_destroy(znode_cache); znode_cache = NULL; } #endif } static int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { zfs_acl_ids_t acl_ids; vattr_t vattr; znode_t *sharezp; znode_t *zp; int error; vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0555; vattr.va_uid = crgetuid(kcred); vattr.va_gid = crgetgid(kcred); sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; sharezp->z_zfsvfs = zfsvfs; sharezp->z_is_sa = zfsvfs->z_use_sa; VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, kcred, NULL, &acl_ids, NULL)); zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, sharezp); POINTER_INVALIDATE(&sharezp->z_zfsvfs); error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); zfsvfs->z_shares_dir = sharezp->z_id; zfs_acl_ids_free(&acl_ids); sa_handle_destroy(sharezp->z_sa_hdl); zfs_znode_free_kmem(sharezp); return (error); } /* * define a couple of values we need available * for both 64 and 32 bit environments. */ #ifndef NBITSMINOR64 #define NBITSMINOR64 32 #endif #ifndef MAXMAJ64 #define MAXMAJ64 0xffffffffUL #endif #ifndef MAXMIN64 #define MAXMIN64 0xffffffffUL #endif /* * Create special expldev for ZFS private use. * Can't use standard expldev since it doesn't do * what we want. The standard expldev() takes a * dev32_t in LP64 and expands it to a long dev_t. * We need an interface that takes a dev32_t in ILP32 * and expands it to a long dev_t. */ static uint64_t zfs_expldev(dev_t dev) { return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); } /* * Special cmpldev for ZFS private use. * Can't use standard cmpldev since it takes * a long dev_t and compresses it to dev32_t in * LP64. We need to do a compaction of a long dev_t * to a dev32_t in ILP32. */ dev_t zfs_cmpldev(uint64_t dev) { return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); ASSERT3P(zp->z_sa_hdl, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); if (sa_hdl == NULL) { VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; /* * Slap on VROOT if we are the root znode unless we are the root * node of a snapshot mounted under .zfs. */ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) ZTOV(zp)->v_flag |= VROOT; vn_exists(ZTOV(zp)); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } static void zfs_vnode_forget(vnode_t *vp) { /* copied from insmntque_stddtr */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Construct a new znode/vnode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; vnode_t *vp; uint64_t mode; uint64_t parent; #ifdef notyet uint64_t mtime[2], ctime[2]; #endif uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[9]; int count = 0; int error; zp = zfs_znode_alloc_kmem(KM_SLEEP); #ifndef _ZFS_USE_SMR KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, ("%s: fast path lookup enabled without smr", __func__)); #endif KASSERT(curthread->td_vp_reserved != NULL, ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); if (error != 0) { zfs_znode_free_kmem(zp); return (NULL); } zp->z_vnode = vp; vp->v_data = zp; /* * Acquire the vnode lock before any possible interaction with the * outside world. Specifically, there is an error path that calls * zfs_vnode_forget() and the vnode should be exclusively locked. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); zp->z_sa_hdl = NULL; zp->z_unlinked = 0; zp->z_atime_dirty = 0; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; atomic_store_ptr(&zp->z_cached_symlink, NULL); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, 16); #ifdef notyet SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); #endif SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, 8); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zfs_vnode_forget(vp); zp->z_vnode = NULL; zfs_znode_free_kmem(zp); return (NULL); } zp->z_projid = projid; zp->z_mode = mode; /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; vp->v_type = IFTOVT((mode_t)mode); switch (vp->v_type) { case VDIR: zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ break; case VFIFO: vp->v_op = &zfs_fifoops; break; case VREG: if (parent == zfsvfs->z_shares_dir) { ASSERT0(zp->z_uid); ASSERT0(zp->z_gid); vp->v_op = &zfs_shareops; } break; default: break; } mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1400077 vn_set_state(vp, VSTATE_CONSTRUCTED); #endif VN_LOCK_AREC(vp); if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); return (zp); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. * * OUT: zpp - allocated znode * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t dzp_pflags = 0; uint64_t rdev = 0; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; dmu_buf_t *db; timestruc_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; ASSERT3P(vap, !=, NULL); ASSERT3U((vap->va_mask & AT_MODE), ==, AT_MODE); if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; vfs_timestamp(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (vap->va_type == VDIR) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } else { dzp_pflags = dzp->z_pflags; } /* * If parent is an xattr, so am I. */ if (dzp_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (vap->va_type == VDIR) { size = 2; /* contents ("." and "..") */ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; } else { size = links = 0; } if (vap->va_type == VBLK || vap->va_type == VCHR) { rdev = zfs_expldev(vap->va_rdev); } parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } if (obj_type == DMU_OT_ZNODE || (vap->va_type == VBLK || vap->va_type == VCHR)) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx)); if (!(flag & IS_ROOT_NODE)) { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); ASSERT3P(*zpp, !=, NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = mode; (*zpp)->z_dnodesize = dnodesize; if (vap->va_mask & AT_XVATTR) zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } if (!(flag & IS_ROOT_NODE)) { vnode_t *vp = ZTOV(*zpp); vp->v_vflag |= VV_FORCEINSMQ; int err = insmntque(vp, zfsvfs->z_vfs); vp->v_vflag &= ~VV_FORCEINSMQ; (void) err; KASSERT(err == 0, ("insmntque() failed: error %d", err)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; xoap = xva_getxoptattr(xvap); ASSERT3P(xoap, !=, NULL); if (zp->z_zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_IN_SEQC(ZTOV(zp)); } if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; vnode_t *vp; sa_handle_t *hdl; int locked; int err; getnewvnode_reserve(); again: *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { vp = ZTOV(zp); /* * Don't let the vnode disappear after * ZFS_OBJ_HOLD_EXIT. */ VN_HOLD(vp); *zpp = zp; err = 0; } sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); if (err) { getnewvnode_drop_reserve(); return (err); } locked = VOP_ISLOCKED(vp); VI_LOCK(vp); if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { /* * The vnode is doomed and this thread doesn't * hold the exclusive lock on it, so the vnode * must be being reclaimed by another thread. * Otherwise the doomed vnode is being reclaimed * by this thread and zfs_zget is called from * ZIL internals. */ VI_UNLOCK(vp); /* * XXX vrele() locks the vnode when the last reference * is dropped. Although in this case the vnode is * doomed / dead and so no inactivation is required, * the vnode lock is still acquired. That could result * in a LOR with z_teardown_lock if another thread holds * the vnode's lock and tries to take z_teardown_lock. * But that is only possible if the other thread peforms * a ZFS vnode operation on the vnode. That either * should not happen if the vnode is dead or the thread * should also have a reference to the vnode and thus * our reference is not last. */ VN_RELE(vp); goto again; } VI_UNLOCK(vp); getnewvnode_drop_reserve(); return (err); } /* * Not found create new znode/vnode * but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } if (err == 0) { vnode_t *vp = ZTOV(zp); err = insmntque(vp, zfsvfs->z_vfs); if (err == 0) { vp->v_hash = obj_num; VOP_UNLOCK(vp); } else { zp->z_vnode = NULL; zfs_znode_dmu_fini(zp); zfs_znode_free(zp); *zpp = NULL; } } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); getnewvnode_drop_reserve(); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_object_info_t doi; dmu_buf_t *db; vnode_t *vp; uint64_t obj_num = zp->z_id; uint64_t mode, size; sa_bulk_attr_t bulk[8]; int err; int count = 0; uint64_t gen; /* * Remove cached pages before reloading the znode, so that they are not * lingering after we run into any error. Ideally, we should vgone() * the vnode in case of error, but currently we cannot do that * because of the LOR between the vnode lock and z_teardown_lock. * So, instead, we have to "doom" the znode in the illumos style. * * Ignore invalid pages during the scan. This is to avoid deadlocks * between page busying and the teardown lock, as pages are busied prior * to a VOP_GETPAGES operation, which acquires the teardown read lock. * Such pages will be invalid and can safely be skipped here. */ vp = ZTOV(zp); #if __FreeBSD_version >= 1400042 vn_pages_remove_valid(vp, 0, 0); #else vn_pages_remove(vp, 0, 0); #endif ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT3P(zp->z_sa_hdl, ==, NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); size = zp->z_size; /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &zp->z_uid, sizeof (zp->z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &zp->z_gid, sizeof (zp->z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } zp->z_mode = mode; if (gen != zp->z_gen) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * It is highly improbable but still quite possible that two * objects in different datasets are created with the same * object numbers and in transaction groups with the same * numbers. znodes corresponding to those objects would * have the same z_id and z_gen, but their other attributes * may be different. * zfs recv -F may replace one of such objects with the other. * As a result file properties recorded in the replaced * object's vnode may no longer match the received object's * properties. At present the only cached property is the * files type recorded in v_type. * So, handle this case by leaving the old vnode and znode * disassociated from the actual object. A new vnode and a * znode will be created if the object is accessed * (e.g. via a look-up). The old vnode and znode will be * recycled when the last vnode reference is dropped. */ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (SET_ERROR(EIO)); } /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatically removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (zp->z_links == 0); if (zp->z_unlinked) { zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } zp->z_blksz = doi.doi_data_block_size; if (zp->z_size != size) vnode_pager_setsize(vp, zp->z_size); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY0(dmu_object_free(os, acl_obj, tx)); } VERIFY0(dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t z_id = zp->z_id; ASSERT3P(zp->z_sa_hdl, !=, NULL); /* * Don't allow a zfs_zget() while were trying to release this znode */ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_rmnode(zp); return; } } zfs_znode_dmu_fini(zp); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); zfs_znode_free(zp); } void zfs_znode_free(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; char *symlink; ASSERT3P(zp->z_sa_hdl, ==, NULL); zp->z_vnode = NULL; mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); symlink = atomic_load_ptr(&zp->z_cached_symlink); if (symlink != NULL) { atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL); cache_symlink_free(symlink, strlen(symlink) + 1); } if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } zfs_znode_free_kmem(zp); } void zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2], boolean_t have_tx) { timestruc_t now; vfs_timestamp(&now); if (have_tx) { /* will sa_bulk_update happen really soon? */ zp->z_atime_dirty = 0; zp->z_seq++; } else { zp->z_atime_dirty = 1; } if (flag & AT_ATIME) { ZFS_TIME_ENCODE(&now, zp->z_atime); } if (flag & AT_MTIME) { ZFS_TIME_ENCODE(&now, mtime); if (zp->z_zfsvfs->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & AT_CTIME) { ZFS_TIME_ENCODE(&now, ctime); if (zp->z_zfsvfs->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); vnode_pager_setsize(ZTOV(zp), end); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); if (error == 0) { #if __FreeBSD_version >= 1400032 vnode_pager_purge_range(ZTOV(zp), off, off + len); #else /* * Before __FreeBSD_version 1400032 we cannot free block in the * middle of a file, but only at the end of a file, so this code * path should never happen. */ vnode_pager_setsize(ZTOV(zp), off); #endif } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); dmu_tx_commit(tx); /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ vnode_pager_setsize(vp, end); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; else return (error); } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) return (error); log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); return (0); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; int i; znode_t *rootzp = NULL; zfsvfs_t *zfsvfs; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT0(error); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64); val = fnvpair_value_uint64(elem); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT0(error); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT3U(version, !=, 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT0(error); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT0(error); /* * Create root znode. Create minimal znode/vnode/zfsvfs * to allow zfs_mknode to work. */ VATTR_NULL(&vattr); vattr.va_mask = AT_MODE|AT_UID|AT_GID; vattr.va_type = VDIR; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT0(error); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); rootzp->z_zfsvfs = zfsvfs; VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, NULL)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT0(error); zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); sa_handle_destroy(rootzp->z_sa_hdl); zfs_znode_free_kmem(rootzp); /* * Create shares directory */ error = zfs_create_share_dir(zfsvfs, tx); ASSERT0(error); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } void zfs_znode_update_vfs(znode_t *zp) { vm_object_t object; if ((object = ZTOV(zp)->v_object) == NULL || zp->z_size == object->un_pager.vnp.vnp_size) return; vnode_pager_setsize(ZTOV(zp), zp->z_size); } int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint64_t parent; int is_xattrdir; int err; /* Extended attributes should not be visible as regular files. */ if ((zp->z_pflags & ZFS_XATTR) != 0) return (SET_ERROR(EINVAL)); err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, &parent, &is_xattrdir); if (err != 0) return (err); ASSERT0(is_xattrdir); /* No name as this is a root object. */ if (parent == zp->z_id) return (SET_ERROR(EINVAL)); err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), buf); if (err != 0) return (err); err = zfs_zget(zfsvfs, parent, dzpp); return (err); } int zfs_rlimit_fsize(off_t fsize) { struct thread *td = curthread; off_t lim; if (td == NULL) return (0); lim = lim_cur(td, RLIMIT_FSIZE); if (__predict_true((uoff_t)fsize <= lim)) return (0); /* * The limit is reached. */ PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } diff --git a/sys/fs/devfs/devfs_devs.c b/sys/fs/devfs/devfs_devs.c index c6dcd4fc7646..124f9f0449af 100644 --- a/sys/fs/devfs/devfs_devs.c +++ b/sys/fs/devfs/devfs_devs.c @@ -1,754 +1,751 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000,2004 * Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vfsops.c 1.36 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The one true (but secret) list of active devices in the system. * Locked by dev_lock()/devmtx */ struct cdev_priv_list cdevp_list = TAILQ_HEAD_INITIALIZER(cdevp_list); struct unrhdr *devfs_inos; static MALLOC_DEFINE(M_DEVFS2, "DEVFS2", "DEVFS data 2"); static MALLOC_DEFINE(M_DEVFS3, "DEVFS3", "DEVFS data 3"); static MALLOC_DEFINE(M_CDEVP, "DEVFS1", "DEVFS cdev_priv storage"); SYSCTL_NODE(_vfs, OID_AUTO, devfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "DEVFS filesystem"); static unsigned devfs_generation; SYSCTL_UINT(_vfs_devfs, OID_AUTO, generation, CTLFLAG_RD, &devfs_generation, 0, "DEVFS generation number"); unsigned devfs_rule_depth = 1; SYSCTL_UINT(_vfs_devfs, OID_AUTO, rule_depth, CTLFLAG_RW, &devfs_rule_depth, 0, "Max depth of ruleset include"); /* * Helper sysctl for devname(3). We're given a dev_t and return the * name, if any, registered by the device driver. */ static int sysctl_devname(SYSCTL_HANDLER_ARGS) { int error; dev_t ud; #ifdef COMPAT_FREEBSD11 uint32_t ud_compat; #endif struct cdev_priv *cdp; struct cdev *dev; if (req->newptr == NULL) return (EINVAL); #ifdef COMPAT_FREEBSD11 if (req->newlen == sizeof(ud_compat)) { error = SYSCTL_IN(req, &ud_compat, sizeof(ud_compat)); if (error == 0) ud = ud_compat == (uint32_t)NODEV ? NODEV : ud_compat; } else #endif error = SYSCTL_IN(req, &ud, sizeof (ud)); if (error) return (error); if (ud == NODEV) return (EINVAL); dev = NULL; dev_lock(); TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) if (cdp->cdp_inode == ud) { dev = &cdp->cdp_c; dev_refl(dev); break; } dev_unlock(); if (dev == NULL) return (ENOENT); error = SYSCTL_OUT(req, dev->si_name, strlen(dev->si_name) + 1); dev_rel(dev); return (error); } SYSCTL_PROC(_kern, OID_AUTO, devname, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE, NULL, 0, sysctl_devname, "", "devname(3) handler"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct cdev), "sizeof(struct cdev)"); - -SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev_priv, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct cdev_priv), "sizeof(struct cdev_priv)"); +SYSCTL_SIZEOF_STRUCT(cdev); +SYSCTL_SIZEOF_STRUCT(cdev_priv); struct cdev * devfs_alloc(int flags) { struct cdev_priv *cdp; struct cdev *cdev; struct timespec ts; cdp = malloc(sizeof *cdp, M_CDEVP, M_ZERO | ((flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK)); if (cdp == NULL) return (NULL); mtx_init(&cdp->cdp_threadlock, "devthrd", NULL, MTX_DEF); cdp->cdp_dirents = &cdp->cdp_dirent0; cdev = &cdp->cdp_c; LIST_INIT(&cdev->si_children); vfs_timestamp(&ts); cdev->si_atime = cdev->si_mtime = cdev->si_ctime = ts; return (cdev); } int devfs_dev_exists(const char *name) { struct cdev_priv *cdp; dev_lock_assert_locked(); TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { if ((cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (devfs_pathpath(cdp->cdp_c.si_name, name) != 0) return (1); if (devfs_pathpath(name, cdp->cdp_c.si_name) != 0) return (1); } if (devfs_dir_find(name) != 0) return (1); return (0); } void devfs_free(struct cdev *cdev) { struct cdev_priv *cdp; cdp = cdev2priv(cdev); KASSERT((cdp->cdp_flags & (CDP_ACTIVE | CDP_ON_ACTIVE_LIST)) == 0, ("%s: cdp %p (%s) still on active list", __func__, cdp, cdev->si_name)); if (cdev->si_cred != NULL) crfree(cdev->si_cred); devfs_free_cdp_inode(cdp->cdp_inode); if (cdp->cdp_maxdirent > 0) free(cdp->cdp_dirents, M_DEVFS2); mtx_destroy(&cdp->cdp_threadlock); free(cdp, M_CDEVP); } struct devfs_dirent * devfs_find(struct devfs_dirent *dd, const char *name, int namelen, int type) { struct devfs_dirent *de; TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (namelen != de->de_dirent->d_namlen) continue; if (type != 0 && type != de->de_dirent->d_type) continue; /* * The race with finding non-active name is not * completely closed by the check, but it is similar * to the devfs_allocv() in making it unlikely enough. */ if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(name, de->de_dirent->d_name, namelen) != 0) continue; break; } KASSERT(de == NULL || (de->de_flags & DE_DOOMED) == 0, ("devfs_find: returning a doomed entry")); return (de); } struct devfs_dirent * devfs_newdirent(char *name, int namelen) { int i; struct devfs_dirent *de; struct dirent d; d.d_namlen = namelen; i = sizeof(*de) + GENERIC_DIRSIZ(&d); de = malloc(i, M_DEVFS3, M_WAITOK | M_ZERO); de->de_dirent = (struct dirent *)(de + 1); de->de_dirent->d_namlen = namelen; de->de_dirent->d_reclen = GENERIC_DIRSIZ(&d); bcopy(name, de->de_dirent->d_name, namelen); dirent_terminate(de->de_dirent); vfs_timestamp(&de->de_ctime); de->de_mtime = de->de_atime = de->de_ctime; de->de_links = 1; de->de_holdcnt = 1; #ifdef MAC mac_devfs_init(de); #endif return (de); } struct devfs_dirent * devfs_parent_dirent(struct devfs_dirent *de) { if (de->de_dirent->d_type != DT_DIR) return (de->de_dir); if (de->de_flags & (DE_DOT | DE_DOTDOT)) return (NULL); de = TAILQ_FIRST(&de->de_dlist); /* "." */ if (de == NULL) return (NULL); de = TAILQ_NEXT(de, de_list); /* ".." */ if (de == NULL) return (NULL); return (de->de_dir); } struct devfs_dirent * devfs_vmkdir(struct devfs_mount *dmp, char *name, int namelen, struct devfs_dirent *dotdot, u_int inode) { struct devfs_dirent *dd; struct devfs_dirent *de; /* Create the new directory */ dd = devfs_newdirent(name, namelen); TAILQ_INIT(&dd->de_dlist); dd->de_dirent->d_type = DT_DIR; dd->de_mode = 0555; dd->de_links = 2; dd->de_dir = dd; if (inode != 0) dd->de_inode = inode; else dd->de_inode = alloc_unr(devfs_inos); /* * "." and ".." are always the two first entries in the * de_dlist list. * * Create the "." entry in the new directory. */ de = devfs_newdirent(".", 1); de->de_dirent->d_type = DT_DIR; de->de_flags |= DE_DOT; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); de->de_dir = dd; /* Create the ".." entry in the new directory. */ de = devfs_newdirent("..", 2); de->de_dirent->d_type = DT_DIR; de->de_flags |= DE_DOTDOT; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); if (dotdot == NULL) { de->de_dir = dd; } else { de->de_dir = dotdot; sx_assert(&dmp->dm_lock, SX_XLOCKED); TAILQ_INSERT_TAIL(&dotdot->de_dlist, dd, de_list); dotdot->de_links++; devfs_rules_apply(dmp, dd); } #ifdef MAC mac_devfs_create_directory(dmp->dm_mount, name, namelen, dd); #endif return (dd); } void devfs_dirent_free(struct devfs_dirent *de) { struct vnode *vp; vp = de->de_vnode; mtx_lock(&devfs_de_interlock); if (vp != NULL && vp->v_data == de) vp->v_data = NULL; mtx_unlock(&devfs_de_interlock); free(de, M_DEVFS3); } /* * Removes a directory if it is empty. Also empty parent directories are * removed recursively. */ static void devfs_rmdir_empty(struct devfs_mount *dm, struct devfs_dirent *de) { struct devfs_dirent *dd, *de_dot, *de_dotdot; sx_assert(&dm->dm_lock, SX_XLOCKED); for (;;) { KASSERT(de->de_dirent->d_type == DT_DIR, ("devfs_rmdir_empty: de is not a directory")); if ((de->de_flags & DE_DOOMED) != 0 || de == dm->dm_rootdir) return; de_dot = TAILQ_FIRST(&de->de_dlist); KASSERT(de_dot != NULL, ("devfs_rmdir_empty: . missing")); de_dotdot = TAILQ_NEXT(de_dot, de_list); KASSERT(de_dotdot != NULL, ("devfs_rmdir_empty: .. missing")); /* Return if the directory is not empty. */ if (TAILQ_NEXT(de_dotdot, de_list) != NULL) return; dd = devfs_parent_dirent(de); KASSERT(dd != NULL, ("devfs_rmdir_empty: NULL dd")); TAILQ_REMOVE(&de->de_dlist, de_dot, de_list); TAILQ_REMOVE(&de->de_dlist, de_dotdot, de_list); TAILQ_REMOVE(&dd->de_dlist, de, de_list); DEVFS_DE_HOLD(dd); devfs_delete(dm, de, DEVFS_DEL_NORECURSE); devfs_delete(dm, de_dot, DEVFS_DEL_NORECURSE); devfs_delete(dm, de_dotdot, DEVFS_DEL_NORECURSE); if (DEVFS_DE_DROP(dd)) { devfs_dirent_free(dd); return; } de = dd; } } /* * The caller needs to hold the dm for the duration of the call since * dm->dm_lock may be temporary dropped. */ void devfs_delete(struct devfs_mount *dm, struct devfs_dirent *de, int flags) { struct devfs_dirent *dd; struct vnode *vp; KASSERT((de->de_flags & DE_DOOMED) == 0, ("devfs_delete doomed dirent")); de->de_flags |= DE_DOOMED; if ((flags & DEVFS_DEL_NORECURSE) == 0) { dd = devfs_parent_dirent(de); if (dd != NULL) DEVFS_DE_HOLD(dd); if (de->de_flags & DE_USER) { KASSERT(dd != NULL, ("devfs_delete: NULL dd")); devfs_dir_unref_de(dm, dd); } } else dd = NULL; mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vhold(vp); mtx_unlock(&devfs_de_interlock); sx_unlock(&dm->dm_lock); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); VOP_UNLOCK(vp); vdrop(vp); sx_xlock(&dm->dm_lock); } else mtx_unlock(&devfs_de_interlock); if (de->de_symlink) { free(de->de_symlink, M_DEVFS); de->de_symlink = NULL; } #ifdef MAC mac_devfs_destroy(de); #endif if (de->de_inode > DEVFS_ROOTINO) { devfs_free_cdp_inode(de->de_inode); de->de_inode = 0; } if (DEVFS_DE_DROP(de)) devfs_dirent_free(de); if (dd != NULL) { if (DEVFS_DE_DROP(dd)) devfs_dirent_free(dd); else devfs_rmdir_empty(dm, dd); } } /* * Called on unmount. * Recursively removes the entire tree. * The caller needs to hold the dm for the duration of the call. */ static void devfs_purge(struct devfs_mount *dm, struct devfs_dirent *dd) { struct devfs_dirent *de; sx_assert(&dm->dm_lock, SX_XLOCKED); DEVFS_DE_HOLD(dd); for (;;) { /* * Use TAILQ_LAST() to remove "." and ".." last. * We might need ".." to resolve a path in * devfs_dir_unref_de(). */ de = TAILQ_LAST(&dd->de_dlist, devfs_dlist_head); if (de == NULL) break; TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_flags & DE_USER) devfs_dir_unref_de(dm, dd); if (de->de_flags & (DE_DOT | DE_DOTDOT)) devfs_delete(dm, de, DEVFS_DEL_NORECURSE); else if (de->de_dirent->d_type == DT_DIR) devfs_purge(dm, de); else devfs_delete(dm, de, DEVFS_DEL_NORECURSE); } if (DEVFS_DE_DROP(dd)) devfs_dirent_free(dd); else if ((dd->de_flags & DE_DOOMED) == 0) devfs_delete(dm, dd, DEVFS_DEL_NORECURSE); } /* * Each cdev_priv has an array of pointers to devfs_dirent which is indexed * by the mount points dm_idx. * This function extends the array when necessary, taking into account that * the default array is 1 element and not malloc'ed. */ static void devfs_metoo(struct cdev_priv *cdp, struct devfs_mount *dm) { struct devfs_dirent **dep, **olddep; int siz; siz = (dm->dm_idx + 1) * sizeof *dep; dep = malloc(siz, M_DEVFS2, M_WAITOK | M_ZERO); dev_lock(); if (dm->dm_idx <= cdp->cdp_maxdirent) { /* We got raced */ dev_unlock(); free(dep, M_DEVFS2); return; } memcpy(dep, cdp->cdp_dirents, (cdp->cdp_maxdirent + 1) * sizeof *dep); olddep = cdp->cdp_maxdirent > 0 ? cdp->cdp_dirents : NULL; cdp->cdp_dirents = dep; /* * XXX: if malloc told us how much we actually got this could * XXX: be optimized. */ cdp->cdp_maxdirent = dm->dm_idx; dev_unlock(); free(olddep, M_DEVFS2); } /* * The caller needs to hold the dm for the duration of the call. */ static int devfs_populate_loop(struct devfs_mount *dm, int cleanup) { struct cdev_priv *cdp; struct devfs_dirent *de; struct devfs_dirent *dd, *dt; struct cdev *pdev; int de_flags, depth, j; char *q, *s; sx_assert(&dm->dm_lock, SX_XLOCKED); dev_lock(); TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) { KASSERT(cdp->cdp_dirents != NULL, ("NULL cdp_dirents")); KASSERT((cdp->cdp_flags & CDP_ON_ACTIVE_LIST) != 0, ("%s: cdp %p (%s) should not be on active list", __func__, cdp, cdp->cdp_c.si_name)); /* * If we are unmounting, or the device has been destroyed, * clean up our dirent. */ if ((cleanup || !(cdp->cdp_flags & CDP_ACTIVE)) && dm->dm_idx <= cdp->cdp_maxdirent && cdp->cdp_dirents[dm->dm_idx] != NULL) { de = cdp->cdp_dirents[dm->dm_idx]; cdp->cdp_dirents[dm->dm_idx] = NULL; KASSERT(cdp == de->de_cdp, ("%s %d %s %p %p", __func__, __LINE__, cdp->cdp_c.si_name, cdp, de->de_cdp)); KASSERT(de->de_dir != NULL, ("Null de->de_dir")); dev_unlock(); TAILQ_REMOVE(&de->de_dir->de_dlist, de, de_list); de->de_cdp = NULL; de->de_inode = 0; devfs_delete(dm, de, 0); dev_lock(); cdp->cdp_inuse--; dev_unlock(); return (1); } /* * GC any lingering devices */ if (!(cdp->cdp_flags & CDP_ACTIVE)) { if (cdp->cdp_inuse > 0) continue; cdp->cdp_flags &= ~CDP_ON_ACTIVE_LIST; TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); return (1); } /* * Don't create any new dirents if we are unmounting */ if (cleanup) continue; KASSERT((cdp->cdp_flags & CDP_ACTIVE), ("Bogons, I tell ya'!")); if (dm->dm_idx <= cdp->cdp_maxdirent && cdp->cdp_dirents[dm->dm_idx] != NULL) { de = cdp->cdp_dirents[dm->dm_idx]; KASSERT(cdp == de->de_cdp, ("inconsistent cdp")); continue; } cdp->cdp_inuse++; dev_unlock(); if (dm->dm_idx > cdp->cdp_maxdirent) devfs_metoo(cdp, dm); dd = dm->dm_rootdir; s = cdp->cdp_c.si_name; for (;;) { for (q = s; *q != '/' && *q != '\0'; q++) continue; if (*q != '/') break; de = devfs_find(dd, s, q - s, 0); if (de == NULL) de = devfs_vmkdir(dm, s, q - s, dd, 0); else if (de->de_dirent->d_type == DT_LNK) { de = devfs_find(dd, s, q - s, DT_DIR); if (de == NULL) de = devfs_vmkdir(dm, s, q - s, dd, 0); de->de_flags |= DE_COVERED; } s = q + 1; dd = de; KASSERT(dd->de_dirent->d_type == DT_DIR && (dd->de_flags & (DE_DOT | DE_DOTDOT)) == 0, ("%s: invalid directory (si_name=%s)", __func__, cdp->cdp_c.si_name)); } de_flags = 0; de = devfs_find(dd, s, q - s, DT_LNK); if (de != NULL) de_flags |= DE_COVERED; de = devfs_newdirent(s, q - s); if (cdp->cdp_c.si_flags & SI_ALIAS) { de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_dirent->d_type = DT_LNK; pdev = cdp->cdp_c.si_parent; dt = dd; depth = 0; while (dt != dm->dm_rootdir && (dt = devfs_parent_dirent(dt)) != NULL) depth++; j = depth * 3 + strlen(pdev->si_name) + 1; de->de_symlink = malloc(j, M_DEVFS, M_WAITOK); de->de_symlink[0] = 0; while (depth-- > 0) strcat(de->de_symlink, "../"); strcat(de->de_symlink, pdev->si_name); } else { de->de_uid = cdp->cdp_c.si_uid; de->de_gid = cdp->cdp_c.si_gid; de->de_mode = cdp->cdp_c.si_mode; de->de_dirent->d_type = DT_CHR; } de->de_flags |= de_flags; de->de_inode = cdp->cdp_inode; de->de_cdp = cdp; #ifdef MAC mac_devfs_create_device(cdp->cdp_c.si_cred, dm->dm_mount, &cdp->cdp_c, de); #endif de->de_dir = dd; TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list); devfs_rules_apply(dm, de); dev_lock(); /* XXX: could check that cdp is still active here */ KASSERT(cdp->cdp_dirents[dm->dm_idx] == NULL, ("%s %d\n", __func__, __LINE__)); cdp->cdp_dirents[dm->dm_idx] = de; KASSERT(de->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); dev_unlock(); return (1); } dev_unlock(); return (0); } int devfs_populate_needed(struct devfs_mount *dm) { return (dm->dm_generation != devfs_generation); } /* * The caller needs to hold the dm for the duration of the call. */ void devfs_populate(struct devfs_mount *dm) { unsigned gen; sx_assert(&dm->dm_lock, SX_XLOCKED); if (!devfs_populate_needed(dm)) return; gen = devfs_generation; while (devfs_populate_loop(dm, 0)) continue; dm->dm_generation = gen; } /* * The caller needs to hold the dm for the duration of the call. */ void devfs_cleanup(struct devfs_mount *dm) { sx_assert(&dm->dm_lock, SX_XLOCKED); while (devfs_populate_loop(dm, 1)) continue; devfs_purge(dm, dm->dm_rootdir); } /* * devfs_create() and devfs_destroy() are called from kern_conf.c and * in both cases the devlock() mutex is held, so no further locking * is necessary and no sleeping allowed. */ void devfs_create(struct cdev *dev) { struct cdev_priv *cdp; dev_lock_assert_locked(); cdp = cdev2priv(dev); KASSERT((cdp->cdp_flags & CDP_ON_ACTIVE_LIST) == 0, ("%s: cdp %p (%s) already on active list", __func__, cdp, dev->si_name)); cdp->cdp_flags |= (CDP_ACTIVE | CDP_ON_ACTIVE_LIST); cdp->cdp_inode = alloc_unrl(devfs_inos); dev_refl(dev); TAILQ_INSERT_TAIL(&cdevp_list, cdp, cdp_list); devfs_generation++; } void devfs_destroy(struct cdev *dev) { struct cdev_priv *cdp; dev_lock_assert_locked(); cdp = cdev2priv(dev); cdp->cdp_flags &= ~CDP_ACTIVE; devfs_generation++; } ino_t devfs_alloc_cdp_inode(void) { return (alloc_unr(devfs_inos)); } void devfs_free_cdp_inode(ino_t ino) { if (ino > 0) free_unr(devfs_inos, ino); } static void devfs_devs_init(void *junk __unused) { devfs_inos = new_unrhdr(DEVFS_ROOTINO + 1, INT_MAX, &devmtx); } SYSINIT(devfs_devs, SI_SUB_DEVFS, SI_ORDER_FIRST, devfs_devs_init, NULL); diff --git a/sys/geom/geom_kern.c b/sys/geom/geom_kern.c index 14707403215d..f8f99087ad9c 100644 --- a/sys/geom/geom_kern.c +++ b/sys/geom/geom_kern.c @@ -1,242 +1,237 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures"); struct sx topology_lock; static struct proc *g_proc; struct thread __read_mostly *g_up_td; struct thread __read_mostly *g_down_td; static struct thread __read_mostly *g_event_td; int __read_mostly g_debugflags; int __read_mostly g_collectstats = G_STATS_PROVIDERS; int g_shutdown; int g_notaste; /* * G_UP and G_DOWN are the two threads which push I/O through the * stack. * * Things are procesed in a FIFO order, but these threads could be * part of I/O prioritization by deciding which bios/bioqs to service * in what order. * * We have only one thread in each direction, it is believed that until * a very non-trivial workload in the UP/DOWN path this will be enough, * but more than one can actually be run without problems. * * Holding the "mymutex" is a debugging feature: It prevents people * from sleeping in the UP/DOWN I/O path by mistake or design (doing * so almost invariably result in deadlocks since it stalls all I/O * processing in the given direction. */ static void g_up_procbody(void *arg) { thread_lock(g_up_td); sched_prio(g_up_td, PRIBIO); thread_unlock(g_up_td); for(;;) { g_io_schedule_up(g_up_td); } } static void g_down_procbody(void *arg) { thread_lock(g_down_td); sched_prio(g_down_td, PRIBIO); thread_unlock(g_down_td); for(;;) { g_io_schedule_down(g_down_td); } } static void g_event_procbody(void *arg) { thread_lock(g_event_td); sched_prio(g_event_td, PRIBIO); thread_unlock(g_event_td); g_run_events(); /* NOTREACHED */ } int g_is_geom_thread(struct thread *td) { return (td == g_up_td || td == g_down_td || td == g_event_td); } static void geom_shutdown(void *foo __unused) { g_shutdown = 1; } void g_init(void) { g_trace(G_T_TOPOLOGY, "g_ignition"); sx_init(&topology_lock, "GEOM topology"); g_io_init(); g_event_init(); g_ctl_init(); kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td, RFHIGHPID, 0, "geom", "g_event"); kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td, RFHIGHPID, 0, "geom", "g_up"); kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td, RFHIGHPID, 0, "geom", "g_down"); EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL, SHUTDOWN_PRI_FIRST); } static int sysctl_kern_geom_confany(struct sysctl_req *req, g_event_t *func, size_t *hint) { size_t len = 0; int error = 0; struct sbuf *sb; if (req->oldptr == NULL) { sb = sbuf_new(NULL, NULL, PAGE_SIZE, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_set_drain(sb, sbuf_count_drain, &len); g_waitfor_event(func, sb, M_WAITOK, NULL); req->oldidx = *hint = len; } else { sb = sbuf_new(NULL, NULL, *hint, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); g_waitfor_event(func, sb, M_WAITOK, NULL); *hint = sbuf_len(sb); error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); } sbuf_delete(sb); return error; } static int sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_conftxt, &hint)); } static int sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confdot, &hint)); } static int sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS) { static size_t hint = PAGE_SIZE; return (sysctl_kern_geom_confany(req, g_confxml, &hint)); } SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOMetry management"); SYSCTL_PROC(_kern_geom, OID_AUTO, confxml, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_geom_confxml, "", "Dump the GEOM config in XML"); SYSCTL_PROC(_kern_geom, OID_AUTO, confdot, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_geom_confdot, "", "Dump the GEOM config in dot"); SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_geom_conftxt, "", "Dump the GEOM config in txt"); SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN, &g_debugflags, 0, "Set various trace levels for GEOM debugging"); SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW, &g_notaste, 0, "Prevent GEOM tasting"); SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW, &g_collectstats, 0, "Control statistics collection on GEOM providers and consumers"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)"); +SYSCTL_SIZEOF_STRUCT(g_class); +SYSCTL_SIZEOF_STRUCT(g_geom); +SYSCTL_SIZEOF_STRUCT(g_provider); +SYSCTL_SIZEOF_STRUCT(g_consumer); +SYSCTL_SIZEOF_STRUCT(g_bioq); diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c index d60c72a00f63..f69275fc3d1d 100644 --- a/sys/kern/kern_mib.c +++ b/sys/kern/kern_mib.c @@ -1,780 +1,774 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include #include "opt_posix.h" #include "opt_config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sysctl internal magic"); SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, 0, "High kernel, proc, limits &c"); SYSCTL_ROOT_NODE(CTL_VM, vm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Virtual memory"); SYSCTL_ROOT_NODE(CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "File system"); SYSCTL_ROOT_NODE(CTL_NET, net, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Network, (see socket.h)"); SYSCTL_ROOT_NODE(CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sizeof various things"); SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "hardware"); SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "machine dependent"); SYSCTL_NODE(_machdep, OID_AUTO, mitigations, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Machine dependent platform mitigations."); SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "user-level"); SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_ROOT_NODE(OID_AUTO, compat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Compatibility code"); SYSCTL_ROOT_NODE(OID_AUTO, security, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Security"); #ifdef REGRESSION SYSCTL_ROOT_NODE(OID_AUTO, regression, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Regression test MIB"); #endif SYSCTL_CONST_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD, kern_ident, "Kernel identifier"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, BSD, "Operating system revision"); SYSCTL_CONST_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, "Kernel version"); SYSCTL_CONST_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD, compiler_version, "Version of compiler used to compile kernel"); SYSCTL_CONST_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD | CTLFLAG_CAPRD, ostype, "Operating system type"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxusers, 0, "Hint for kernel tuning"); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, _POSIX_VERSION, "Version of POSIX attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &ngroups_max, 0, "Maximum number of supplemental groups a user can belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = PATH_KERNEL; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); #ifdef COMPAT_FREEBSD12 static int sysctl_maxphys(SYSCTL_HANDLER_ARGS) { u_long lvalue; int ivalue; lvalue = maxphys; if (sizeof(int) == sizeof(u_long) || req->oldlen >= sizeof(u_long)) return (sysctl_handle_long(oidp, &lvalue, 0, req)); if (lvalue > INT_MAX) return (sysctl_handle_long(oidp, &lvalue, 0, req)); ivalue = lvalue; return (sysctl_handle_int(oidp, &ivalue, 0, req)); } SYSCTL_PROC(_kern, KERN_MAXPHYS, maxphys, CTLTYPE_LONG | CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL, 0, sysctl_maxphys, "UL", "Maximum block I/O access size"); #else SYSCTL_ULONG(_kern, KERN_MAXPHYS, maxphys, CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &maxphys, 0, "Maximum block I/O access size"); #endif SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncpus, 0, "Number of active CPUs"); SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, PAGE_SIZE, "System memory page size"); static int sysctl_kern_arnd(SYSCTL_HANDLER_ARGS) { char buf[256]; size_t len; int error; len = MIN(req->oldlen, sizeof(buf)); read_random(buf, len); error = SYSCTL_OUT(req, buf, len); explicit_bzero(buf, len); return (error); } SYSCTL_PROC(_kern, KERN_ARND, arandom, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_kern_arnd, "", "arc4rand"); static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { u_long val, p; p = SIZE_T_MAX >> PAGE_SHIFT; if (physmem < p) p = physmem; val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_hw_physmem, "LU", "Amount of physical memory (in bytes)"); static int sysctl_hw_realmem(SYSCTL_HANDLER_ARGS) { u_long val, p; p = SIZE_T_MAX >> PAGE_SHIFT; if (realmem < p) p = realmem; val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_hw_realmem, "LU", "Amount of memory (in bytes) reported by the firmware"); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { u_long val, p, p1; p1 = physmem - vm_wire_count(); p = SIZE_T_MAX >> PAGE_SHIFT; if (p1 < p) p = p1; val = ctob(p); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_hw_usermem, "LU", "Amount of memory (in bytes) which is not wired"); SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, "Amount of physical memory (in pages)"); u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE }; static int sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS) { int error; size_t len; #ifdef SCTL_MASK32 int i; uint32_t pagesizes32[MAXPAGESIZES]; if (req->flags & SCTL_MASK32) { /* * Recreate the "pagesizes" array with 32-bit elements. * Truncate any page size greater than UINT32_MAX to zero, * which assumes that page sizes are powers of two. */ for (i = 0; i < MAXPAGESIZES; i++) pagesizes32[i] = (uint32_t)pagesizes[i]; len = sizeof(pagesizes32); if (len > req->oldlen && req->oldptr != NULL) len = req->oldlen; error = SYSCTL_OUT(req, pagesizes32, len); } else #endif { len = sizeof(pagesizes); if (len > req->oldlen && req->oldptr != NULL) len = req->oldlen; error = SYSCTL_OUT(req, pagesizes, len); } return (error); } SYSCTL_PROC(_hw, OID_AUTO, pagesizes, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_pagesizes, "S,pagesizes", "Supported page sizes"); int adaptive_machine_arch = 1; SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW, &adaptive_machine_arch, 1, "Adapt reported machine architecture to the ABI of the binary"); static const char * proc_machine_arch(struct proc *p) { if (p->p_sysent->sv_machine_arch != NULL) return (p->p_sysent->sv_machine_arch(p)); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(p, SV_ILP32)) return (MACHINE_ARCH32); #endif return (MACHINE_ARCH); } static int sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS) { const char *machine_arch; if (adaptive_machine_arch) machine_arch = proc_machine_arch(curproc); else machine_arch = MACHINE_ARCH; return (SYSCTL_OUT(req, machine_arch, strlen(machine_arch) + 1)); } SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine_arch, "A", "System architecture"); #ifdef COMPAT_FREEBSD32 #include #endif static int sysctl_kern_supported_archs(SYSCTL_HANDLER_ARGS) { const char *supported_archs; supported_archs = #ifdef COMPAT_FREEBSD32 compat_freebsd_32bit ? MACHINE_ARCH " " MACHINE_ARCH32 : #endif MACHINE_ARCH; return (SYSCTL_OUT(req, supported_archs, strlen(supported_archs) + 1)); } SYSCTL_PROC(_kern, OID_AUTO, supported_archs, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD | CTLTYPE_STRING, NULL, 0, sysctl_kern_supported_archs, "A", "Supported architectures for binaries"); static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; size_t pr_offset; char tmpname[MAXHOSTNAMELEN]; int descend, error, len; /* * This function can set: hostname domainname hostuuid. * Keep that in mind when comments say "hostname". */ pr_offset = (size_t)arg1; len = arg2; KASSERT(len <= sizeof(tmpname), ("length %d too long for %s", len, __func__)); /* * Make a local copy of hostname to get/set so we don't have to hold * the jail mutex during the sysctl copyin/copyout activities. */ pr = req->td->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); bcopy((char *)pr + pr_offset, tmpname, len); mtx_unlock(&pr->pr_mtx); error = sysctl_handle_string(oidp, tmpname, len, req); if (error != 0 || req->newptr == NULL) return (error); /* * Copy the locally set hostname to all jails that share * this host info. */ sx_slock(&allprison_lock); if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME)) error = EPERM; else { while (!(pr->pr_flags & PR_HOST)) pr = pr->pr_parent; mtx_lock(&pr->pr_mtx); bcopy(tmpname, (char *)pr + pr_offset, len); FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) if (cpr->pr_flags & PR_HOST) descend = 0; else bcopy(tmpname, (char *)cpr + pr_offset, len); mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN, sysctl_hostname, "A", "Hostname"); SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN, sysctl_hostname, "A", "Name of the current YP/NIS domain"); SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN, sysctl_hostname, "A", "Host UUID"); static int regression_securelevel_nonmonotonic = 0; #ifdef REGRESSION SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); #endif static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; int descend, error, level; pr = req->td->td_ucred->cr_prison; /* * Reading the securelevel is easy, since the current jail's level * is known to be at least as secure as any higher levels. Perform * a lockless read since the securelevel is an integer. */ level = pr->pr_securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); /* Permit update only if the new securelevel exceeds the old. */ sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); if (!regression_securelevel_nonmonotonic && level < pr->pr_securelevel) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EPERM); } pr->pr_securelevel = level; /* * Set all child jails to be at least this level, but do not lower * them (even if regression_securelevel_nonmonotonic). */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) { if (cpr->pr_securelevel < level) cpr->pr_securelevel = level; } mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); #ifdef INCLUDE_CONFIG_FILE /* Actual kernel configuration options. */ extern const char kernconfstring[]; SYSCTL_CONST_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD, kernconfstring, "Kernel configuration file"); #endif static int sysctl_hostid(SYSCTL_HANDLER_ARGS) { struct prison *pr, *cpr; u_long tmpid; int descend, error; /* * Like sysctl_hostname, except it operates on a u_long * instead of a string, and is used only for hostid. */ pr = req->td->td_ucred->cr_prison; mtx_lock(&pr->pr_mtx); tmpid = pr->pr_hostid; mtx_unlock(&pr->pr_mtx); error = sysctl_handle_long(oidp, &tmpid, 0, req); if (error != 0 || req->newptr == NULL) return (error); sx_slock(&allprison_lock); if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME)) error = EPERM; else { while (!(pr->pr_flags & PR_HOST)) pr = pr->pr_parent; mtx_lock(&pr->pr_mtx); pr->pr_hostid = tmpid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) if (cpr->pr_flags & PR_HOST) descend = 0; else cpr->pr_hostid = tmpid; mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); return (error); } SYSCTL_PROC(_kern, KERN_HOSTID, hostid, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_hostid, "LU", "Host ID"); static struct mtx bootid_lk; MTX_SYSINIT(bootid_lock, &bootid_lk, "bootid generator lock", MTX_DEF); static int sysctl_bootid(SYSCTL_HANDLER_ARGS) { static uint8_t boot_id[16]; static bool initialized = false; mtx_lock(&bootid_lk); if (!initialized) { if (!is_random_seeded()) { mtx_unlock(&bootid_lk); return (ENXIO); } arc4random_buf(boot_id, sizeof(boot_id)); initialized = true; } mtx_unlock(&bootid_lk); return (SYSCTL_OUT(req, boot_id, sizeof(boot_id))); } SYSCTL_PROC(_kern, OID_AUTO, boot_id, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0, sysctl_bootid, "", "Random boot ID"); /* * The osrelease string is copied from the global (osrelease in vers.c) into * prison0 by a sysinit and is inherited by child jails if not changed at jail * creation, so we always return the copy from the current prison data. */ static int sysctl_osrelease(SYSCTL_HANDLER_ARGS) { struct prison *pr; pr = req->td->td_ucred->cr_prison; return (SYSCTL_OUT(req, pr->pr_osrelease, strlen(pr->pr_osrelease) + 1)); } SYSCTL_PROC(_kern, KERN_OSRELEASE, osrelease, CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osrelease, "A", "Operating system release"); /* * The osreldate number is copied from the global (osreldate in vers.c) into * prison0 by a sysinit and is inherited by child jails if not changed at jail * creation, so we always return the value from the current prison data. */ static int sysctl_osreldate(SYSCTL_HANDLER_ARGS) { struct prison *pr; pr = req->td->td_ucred->cr_prison; return (SYSCTL_OUT(req, &pr->pr_osreldate, sizeof(pr->pr_osreldate))); } /* * NOTICE: The *userland* release date is available in * /usr/include/osreldate.h */ SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate, CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_osreldate, "I", "Kernel release date"); /* * The build-id is copied from the ELF section .note.gnu.build-id. The linker * script defines two variables to expose the beginning and end. LLVM * currently uses a SHA-1 hash, but other formats can be supported by checking * the length of the section. */ extern char __build_id_start[]; extern char __build_id_end[]; #define BUILD_ID_HEADER_LEN 0x10 #define BUILD_ID_HASH_MAXLEN 0x14 static int sysctl_build_id(SYSCTL_HANDLER_ARGS) { uintptr_t sectionlen = (uintptr_t)(__build_id_end - __build_id_start); int hashlen; char buf[2*BUILD_ID_HASH_MAXLEN+1]; /* * The ELF note section has a four byte length for the vendor name, * four byte length for the value, and a four byte vendor specific * type. The name for the build id is "GNU\0". We skip the first 16 * bytes to read the build hash. We will return the remaining bytes up * to 20 (SHA-1) hash size. If the hash happens to be a custom number * of bytes we will pad the value with zeros, as the section should be * four byte aligned. */ if (sectionlen <= BUILD_ID_HEADER_LEN || sectionlen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN)) { return (ENOENT); } hashlen = sectionlen - BUILD_ID_HEADER_LEN; for (int i = 0; i < hashlen; i++) { uint8_t c = __build_id_start[i+BUILD_ID_HEADER_LEN]; snprintf(&buf[2*i], 3, "%02x", c); } return (SYSCTL_OUT(req, buf, strlen(buf) + 1)); } SYSCTL_PROC(_kern, OID_AUTO, build_id, CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_build_id, "A", "Operating system build-id"); SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Kernel Features"); #ifdef COMPAT_FREEBSD4 FEATURE(compat_freebsd4, "Compatible with FreeBSD 4"); #endif #ifdef COMPAT_FREEBSD5 FEATURE(compat_freebsd5, "Compatible with FreeBSD 5"); #endif #ifdef COMPAT_FREEBSD6 FEATURE(compat_freebsd6, "Compatible with FreeBSD 6"); #endif #ifdef COMPAT_FREEBSD7 FEATURE(compat_freebsd7, "Compatible with FreeBSD 7"); #endif #ifdef COMPAT_FREEBSD8 FEATURE(compat_freebsd8, "Compatible with FreeBSD 8"); #endif #ifdef COMPAT_FREEBSD9 FEATURE(compat_freebsd9, "Compatible with FreeBSD 9"); #endif #ifdef COMPAT_FREEBSD10 FEATURE(compat_freebsd10, "Compatible with FreeBSD 10"); #endif #ifdef COMPAT_FREEBSD11 FEATURE(compat_freebsd11, "Compatible with FreeBSD 11"); #endif #ifdef COMPAT_FREEBSD12 FEATURE(compat_freebsd12, "Compatible with FreeBSD 12"); #endif #ifdef COMPAT_FREEBSD13 FEATURE(compat_freebsd13, "Compatible with FreeBSD 13"); #endif /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. * * XXXRW: These probably should be CTLFLAG_CAPRD. */ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, ""); SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of types supported for timezone names"); static char localbase[MAXPATHLEN] = ""; SYSCTL_STRING(_user, USER_LOCALBASE, localbase, CTLFLAG_RWTUN, localbase, sizeof(localbase), "Prefix used to install and locate add-on packages"); #include -SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct vnode), "sizeof(struct vnode)"); +SYSCTL_SIZEOF_STRUCT(vnode); -SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct proc), "sizeof(struct proc)"); +SYSCTL_SIZEOF_STRUCT(proc); static int sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS) { int error, pm; pm = pid_max; error = sysctl_handle_int(oidp, &pm, 0, req); if (error || !req->newptr) return (error); sx_xlock(&proctree_lock); sx_xlock(&allproc_lock); /* * Only permit the values less then PID_MAX. * As a safety measure, do not allow to limit the pid_max too much. */ if (pm < 300 || pm > PID_MAX) error = EINVAL; else pid_max = pm; sx_xunlock(&allproc_lock); sx_xunlock(&proctree_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_pid_max, "I", "Maximum allowed pid"); #include #include -SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct bio), "sizeof(struct bio)"); -SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct buf), "sizeof(struct buf)"); +SYSCTL_SIZEOF_STRUCT(bio); +SYSCTL_SIZEOF_STRUCT(buf); #include -SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); +SYSCTL_SIZEOF_STRUCT(kinfo_proc); /* Used by kernel debuggers. */ const int pcb_size = sizeof(struct pcb); -SYSCTL_INT(_debug_sizeof, OID_AUTO, pcb, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct pcb), "sizeof(struct pcb)"); +SYSCTL_SIZEOF_STRUCT(pcb); /* XXX compatibility, remove for 6.0 */ #include #include SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW, &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)), "compatibility for kern.fallback_elf_brand"); diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c index 6c0977d1cc35..3a107ac30390 100644 --- a/sys/kern/subr_devstat.c +++ b/sys/kern/subr_devstat.c @@ -1,604 +1,603 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DEFINE(io); SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *"); SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *"); #define DTRACE_DEVSTAT_BIO_START() SDT_PROBE2(io, , , start, bp, ds) #define DTRACE_DEVSTAT_BIO_DONE() SDT_PROBE2(io, , , done, bp, ds) static int devstat_num_devs; static long devstat_generation = 1; static int devstat_version = DEVSTAT_VERSION; static int devstat_current_devnumber; static struct mtx devstat_mutex; MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF); static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq); static struct devstat *devstat_alloc(void); static void devstat_free(struct devstat *); static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority); /* * Allocate a devstat and initialize it */ struct devstat * devstat_new_entry(const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstat *ds; mtx_assert(&devstat_mutex, MA_NOTOWNED); ds = devstat_alloc(); mtx_lock(&devstat_mutex); if (unit_number == -1) { ds->unit_number = unit_number; ds->id = dev_name; binuptime(&ds->creation_time); devstat_generation++; } else { devstat_add_entry(ds, dev_name, unit_number, block_size, flags, device_type, priority); } mtx_unlock(&devstat_mutex); return (ds); } /* * Take a malloced and zeroed devstat structure given to us, fill it in * and add it to the queue of devices. */ static void devstat_add_entry(struct devstat *ds, const void *dev_name, int unit_number, uint32_t block_size, devstat_support_flags flags, devstat_type_flags device_type, devstat_priority priority) { struct devstatlist *devstat_head; struct devstat *ds_tmp; mtx_assert(&devstat_mutex, MA_OWNED); devstat_num_devs++; devstat_head = &device_statq; /* * Priority sort. Each driver passes in its priority when it adds * its devstat entry. Drivers are sorted first by priority, and * then by probe order. * * For the first device, we just insert it, since the priority * doesn't really matter yet. Subsequent devices are inserted into * the list using the order outlined above. */ if (devstat_num_devs == 1) STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); else { STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) { struct devstat *ds_next; ds_next = STAILQ_NEXT(ds_tmp, dev_links); /* * If we find a break between higher and lower * priority items, and if this item fits in the * break, insert it. This also applies if the * "lower priority item" is the end of the list. */ if ((priority <= ds_tmp->priority) && ((ds_next == NULL) || (priority > ds_next->priority))) { STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds, dev_links); break; } else if (priority > ds_tmp->priority) { /* * If this is the case, we should be able * to insert ourselves at the head of the * list. If we can't, something is wrong. */ if (ds_tmp == STAILQ_FIRST(devstat_head)) { STAILQ_INSERT_HEAD(devstat_head, ds, dev_links); break; } else { STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); printf("devstat_add_entry: HELP! " "sorting problem detected " "for name %p unit %d\n", dev_name, unit_number); break; } } } } ds->device_number = devstat_current_devnumber++; ds->unit_number = unit_number; strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); ds->block_size = block_size; ds->flags = flags; ds->device_type = device_type; ds->priority = priority; binuptime(&ds->creation_time); devstat_generation++; } /* * Remove a devstat structure from the list of devices. */ void devstat_remove_entry(struct devstat *ds) { struct devstatlist *devstat_head; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (ds == NULL) return; mtx_lock(&devstat_mutex); devstat_head = &device_statq; /* Remove this entry from the devstat queue */ atomic_add_acq_int(&ds->sequence1, 1); if (ds->unit_number != -1) { devstat_num_devs--; STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); } devstat_free(ds); devstat_generation++; mtx_unlock(&devstat_mutex); } /* * Record a transaction start. * * See comments for devstat_end_transaction(). Ordering is very important * here. */ void devstat_start_transaction(struct devstat *ds, const struct bintime *now) { /* sanity check */ if (ds == NULL) return; atomic_add_acq_int(&ds->sequence1, 1); /* * We only want to set the start time when we are going from idle * to busy. The start time is really the start of the latest busy * period. */ if (atomic_fetchadd_int(&ds->start_count, 1) == ds->end_count) { if (now != NULL) ds->busy_from = *now; else binuptime(&ds->busy_from); } atomic_add_rel_int(&ds->sequence0, 1); } void devstat_start_transaction_bio(struct devstat *ds, struct bio *bp) { /* sanity check */ if (ds == NULL) return; binuptime(&bp->bio_t0); devstat_start_transaction_bio_t0(ds, bp); } void devstat_start_transaction_bio_t0(struct devstat *ds, struct bio *bp) { /* sanity check */ if (ds == NULL) return; devstat_start_transaction(ds, &bp->bio_t0); DTRACE_DEVSTAT_BIO_START(); } /* * Record the ending of a transaction, and incrment the various counters. * * Ordering in this function, and in devstat_start_transaction() is VERY * important. The idea here is to run without locks, so we are very * careful to only modify some fields on the way "down" (i.e. at * transaction start) and some fields on the way "up" (i.e. at transaction * completion). One exception is busy_from, which we only modify in * devstat_start_transaction() when there are no outstanding transactions, * and thus it can't be modified in devstat_end_transaction() * simultaneously. * * The sequence0 and sequence1 fields are provided to enable an application * spying on the structures with mmap(2) to tell when a structure is in a * consistent state or not. * * For this to work 100% reliably, it is important that the two fields * are at opposite ends of the structure and that they are incremented * in the opposite order of how a memcpy(3) in userland would copy them. * We assume that the copying happens front to back, but there is actually * no way short of writing your own memcpy(3) replacement to guarantee * this will be the case. * * In addition to this, being a kind of locks, they must be updated with * atomic instructions using appropriate memory barriers. */ void devstat_end_transaction(struct devstat *ds, uint32_t bytes, devstat_tag_type tag_type, devstat_trans_flags flags, const struct bintime *now, const struct bintime *then) { struct bintime dt, lnow; /* sanity check */ if (ds == NULL) return; if (now == NULL) { binuptime(&lnow); now = &lnow; } atomic_add_acq_int(&ds->sequence1, 1); /* Update byte and operations counts */ ds->bytes[flags] += bytes; ds->operations[flags]++; /* * Keep a count of the various tag types sent. */ if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 && tag_type != DEVSTAT_TAG_NONE) ds->tag_types[tag_type]++; if (then != NULL) { /* Update duration of operations */ dt = *now; bintime_sub(&dt, then); bintime_add(&ds->duration[flags], &dt); } /* Accumulate busy time */ dt = *now; bintime_sub(&dt, &ds->busy_from); bintime_add(&ds->busy_time, &dt); ds->busy_from = *now; ds->end_count++; atomic_add_rel_int(&ds->sequence0, 1); } void devstat_end_transaction_bio(struct devstat *ds, const struct bio *bp) { devstat_end_transaction_bio_bt(ds, bp, NULL); } void devstat_end_transaction_bio_bt(struct devstat *ds, const struct bio *bp, const struct bintime *now) { devstat_trans_flags flg; devstat_tag_type tag; /* sanity check */ if (ds == NULL) return; if (bp->bio_flags & BIO_ORDERED) tag = DEVSTAT_TAG_ORDERED; else tag = DEVSTAT_TAG_SIMPLE; if (bp->bio_cmd == BIO_DELETE) flg = DEVSTAT_FREE; else if ((bp->bio_cmd == BIO_READ) || ((bp->bio_cmd == BIO_ZONE) && (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES))) flg = DEVSTAT_READ; else if (bp->bio_cmd == BIO_WRITE) flg = DEVSTAT_WRITE; else flg = DEVSTAT_NO_DATA; devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid, tag, flg, now, &bp->bio_t0); DTRACE_DEVSTAT_BIO_DONE(); } /* * This is the sysctl handler for the devstat package. The data pushed out * on the kern.devstat.all sysctl variable consists of the current devstat * generation number, and then an array of devstat structures, one for each * device in the system. * * This is more cryptic that obvious, but basically we neither can nor * want to hold the devstat_mutex for any amount of time, so we grab it * only when we need to and keep an eye on devstat_generation all the time. */ static int sysctl_devstat(SYSCTL_HANDLER_ARGS) { int error; long mygen; struct devstat *nds; mtx_assert(&devstat_mutex, MA_NOTOWNED); /* * XXX devstat_generation should really be "volatile" but that * XXX freaks out the sysctl macro below. The places where we * XXX change it and inspect it are bracketed in the mutex which * XXX guarantees us proper write barriers. I don't believe the * XXX compiler is allowed to optimize mygen away across calls * XXX to other functions, so the following is belived to be safe. */ mygen = devstat_generation; error = SYSCTL_OUT(req, &mygen, sizeof(mygen)); if (devstat_num_devs == 0) return(0); if (error != 0) return (error); mtx_lock(&devstat_mutex); nds = STAILQ_FIRST(&device_statq); if (mygen != devstat_generation) error = EBUSY; mtx_unlock(&devstat_mutex); if (error != 0) return (error); for (;nds != NULL;) { error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); if (error != 0) return (error); mtx_lock(&devstat_mutex); if (mygen != devstat_generation) error = EBUSY; else nds = STAILQ_NEXT(nds, dev_links); mtx_unlock(&devstat_mutex); if (error != 0) return (error); } return(error); } /* * Sysctl entries for devstat. The first one is a node that all the rest * hang off of. */ static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Device Statistics"); SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD | CTLTYPE_OPAQUE | CTLFLAG_MPSAFE, NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list"); /* * Export the number of devices in the system so that userland utilities * can determine how much memory to allocate to hold all the devices. */ SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, 0, "Number of devices in the devstat list"); SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, &devstat_generation, 0, "Devstat list generation"); SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, 0, "Devstat list version number"); /* * Allocator for struct devstat structures. We sub-allocate these from pages * which we get from malloc. These pages are exported for mmap(2)'ing through * a miniature device driver */ #define statsperpage (PAGE_SIZE / sizeof(struct devstat)) static d_ioctl_t devstat_ioctl; static d_mmap_t devstat_mmap; static struct cdevsw devstat_cdevsw = { .d_version = D_VERSION, .d_ioctl = devstat_ioctl, .d_mmap = devstat_mmap, .d_name = "devstat", }; struct statspage { TAILQ_ENTRY(statspage) list; struct devstat *stat; u_int nfree; }; static size_t pagelist_pages = 0; static TAILQ_HEAD(, statspage) pagelist = TAILQ_HEAD_INITIALIZER(pagelist); static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics"); static int devstat_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { int error = ENOTTY; switch (cmd) { case DIOCGMEDIASIZE: error = 0; *(off_t *)data = pagelist_pages * PAGE_SIZE; break; } return (error); } static int devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct statspage *spp; if (nprot != VM_PROT_READ) return (-1); mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) { if (offset == 0) { *paddr = vtophys(spp->stat); mtx_unlock(&devstat_mutex); return (0); } offset -= PAGE_SIZE; } mtx_unlock(&devstat_mutex); return (-1); } static struct devstat * devstat_alloc(void) { struct devstat *dsp; struct statspage *spp, *spp2; u_int u; static int once; mtx_assert(&devstat_mutex, MA_NOTOWNED); if (!once) { make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME, &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444, DEVSTAT_DEVICE_NAME); once = 1; } spp2 = NULL; mtx_lock(&devstat_mutex); for (;;) { TAILQ_FOREACH(spp, &pagelist, list) { if (spp->nfree > 0) break; } if (spp != NULL) break; mtx_unlock(&devstat_mutex); spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK); spp2->nfree = statsperpage; /* * If free statspages were added while the lock was released * just reuse them. */ mtx_lock(&devstat_mutex); TAILQ_FOREACH(spp, &pagelist, list) if (spp->nfree > 0) break; if (spp == NULL) { spp = spp2; /* * It would make more sense to add the new page at the * head but the order on the list determine the * sequence of the mapping so we can't do that. */ pagelist_pages++; TAILQ_INSERT_TAIL(&pagelist, spp, list); } else break; } dsp = spp->stat; for (u = 0; u < statsperpage; u++) { if (dsp->allocated == 0) break; dsp++; } spp->nfree--; dsp->allocated = 1; mtx_unlock(&devstat_mutex); if (spp2 != NULL && spp2 != spp) { free(spp2->stat, M_DEVSTAT); free(spp2, M_DEVSTAT); } return (dsp); } static void devstat_free(struct devstat *dsp) { struct statspage *spp; mtx_assert(&devstat_mutex, MA_OWNED); bzero(dsp, sizeof *dsp); TAILQ_FOREACH(spp, &pagelist, list) { if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) { spp->nfree++; return; } } } -SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD, - SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)"); +SYSCTL_SIZEOF_STRUCT(devstat); diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 6f92130e12b3..300173347401 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -1,6310 +1,6309 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #ifdef DDB #include #endif #include /* * High level overview of name caching in the VFS layer. * * Originally caching was implemented as part of UFS, later extracted to allow * use by other filesystems. A decision was made to make it optional and * completely detached from the rest of the kernel, which comes with limitations * outlined near the end of this comment block. * * This fundamental choice needs to be revisited. In the meantime, the current * state is described below. Significance of all notable routines is explained * in comments placed above their implementation. Scattered thoroughout the * file are TODO comments indicating shortcomings which can be fixed without * reworking everything (most of the fixes will likely be reusable). Various * details are omitted from this explanation to not clutter the overview, they * have to be checked by reading the code and associated commentary. * * Keep in mind that it's individual path components which are cached, not full * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries, * one for each name. * * I. Data organization * * Entries are described by "struct namecache" objects and stored in a hash * table. See cache_get_hash for more information. * * "struct vnode" contains pointers to source entries (names which can be found * when traversing through said vnode), destination entries (names of that * vnode (see "Limitations" for a breakdown on the subject) and a pointer to * the parent vnode. * * The (directory vnode; name) tuple reliably determines the target entry if * it exists. * * Since there are no small locks at this time (all are 32 bytes in size on * LP64), the code works around the problem by introducing lock arrays to * protect hash buckets and vnode lists. * * II. Filesystem integration * * Filesystems participating in name caching do the following: * - set vop_lookup routine to vfs_cache_lookup * - set vop_cachedlookup to whatever can perform the lookup if the above fails * - if they support lockless lookup (see below), vop_fplookup_vexec and * vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the * mount point * - call cache_purge or cache_vop_* routines to eliminate stale entries as * applicable * - call cache_enter to add entries depending on the MAKEENTRY flag * * With the above in mind, there are 2 entry points when doing lookups: * - ... -> namei -> cache_fplookup -- this is the default * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei * should the above fail * * Example code flow how an entry is added: * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP -> * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter * * III. Performance considerations * * For lockless case forward lookup avoids any writes to shared areas apart * from the terminal path component. In other words non-modifying lookups of * different files don't suffer any scalability problems in the namecache. * Looking up the same file is limited by VFS and goes beyond the scope of this * file. * * At least on amd64 the single-threaded bottleneck for long paths is hashing * (see cache_get_hash). There are cases where the code issues acquire fence * multiple times, they can be combined on architectures which suffer from it. * * For locked case each encountered vnode has to be referenced and locked in * order to be handed out to the caller (normally that's namei). This * introduces significant hit single-threaded and serialization multi-threaded. * * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached -- * avoids any writes to shared areas to any components. * * Unrelated insertions are partially serialized on updating the global entry * counter and possibly serialized on colliding bucket or vnode locks. * * IV. Observability * * Note not everything has an explicit dtrace probe nor it should have, thus * some of the one-liners below depend on implementation details. * * Examples: * * # Check what lookups failed to be handled in a lockless manner. Column 1 is * # line number, column 2 is status code (see cache_fpl_status) * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }' * * # Lengths of names added by binary name * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }' * * # Same as above but only those which exceed 64 characters * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }' * * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what * # path is it * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }' * * V. Limitations and implementation defects * * - since it is possible there is no entry for an open file, tools like * "procstat" may fail to resolve fd -> vnode -> path to anything * - even if a filesystem adds an entry, it may get purged (e.g., due to memory * shortage) in which case the above problem applies * - hardlinks are not tracked, thus if a vnode is reachable in more than one * way, resolving a name may return a different path than the one used to * open it (even if said path is still valid) * - by default entries are not added for newly created files * - adding an entry may need to evict negative entry first, which happens in 2 * distinct places (evicting on lookup, adding in a later VOP) making it * impossible to simply reuse it * - there is a simple scheme to evict negative entries as the cache is approaching * its capacity, but it is very unclear if doing so is a good idea to begin with * - vnodes are subject to being recycled even if target inode is left in memory, * which loses the name cache entries when it perhaps should not. in case of tmpfs * names get duplicated -- kept by filesystem itself and namecache separately * - struct namecache has a fixed size and comes in 2 variants, often wasting space. * now hard to replace with malloc due to dependence on SMR. * - lack of better integration with the kernel also turns nullfs into a layered * filesystem instead of something which can take advantage of caching */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache"); SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *", "const char *"); SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *", "struct namecache *", "int", "int"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *", "struct componentname *"); SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t"); SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); static char __read_frequently cache_fast_lookup_enabled = true; /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; u_char neg_hit; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. */ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ int nc_pad; struct namecache nc_nc; }; TAILQ_HEAD(cache_freebatch, namecache); /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) /* * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the * 4.4 BSD codebase. Later on struct namecache was tweaked to become * smaller and the value was bumped to retain the total size, but it * was never re-evaluated for suitability. A simple test counting * lengths during package building shows that the value of 45 covers * about 86% of all added entries, reaching 99% at 65. * * Regardless of the above, use of dedicated zones instead of malloc may be * inducing additional waste. This may be hard to address as said zones are * tied to VFS SMR. Even if retaining them, the current split should be * re-evaluated. */ #ifdef __LP64__ #define CACHE_PATH_CUTOFF 45 #define CACHE_LARGE_PAD 6 #else #define CACHE_PATH_CUTOFF 41 #define CACHE_LARGE_PAD 2 #endif #define CACHE_ZONE_SMALL_SIZE (offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE) #define CACHE_ZONE_LARGE_SIZE (offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD) #define CACHE_ZONE_LARGE_TS_SIZE (offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 static bool cache_neg_evict_cond(u_long lnumcache); /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ #define cache_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0); \ }) /* * Like the above but also checks NCF_WHITE. */ #define cache_fpl_neg_ncp_canuse(ncp) ({ \ struct namecache *_ncp = (ncp); \ u_char _nc_flag; \ \ atomic_thread_fence_acq(); \ _nc_flag = atomic_load_char(&_ncp->nc_flag); \ __predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0); \ }) VFS_SMR_DECLARE; static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache parameters"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0, "Total namecache capacity"); u_int ncsizefactor = 2; SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); /* * Negative entry % of namecache capacity above which automatic eviction is allowed. * * Check cache_neg_evict_cond for details. */ static u_int ncnegminpct = 3; static u_int __read_mostly neg_min; /* the above recomputed against ncsize */ SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0, "Negative entry count above which automatic eviction is allowed"); /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ struct nchstats nchstats; /* cache effectiveness statistics */ static u_int __exclusive_cache_line neg_cycle; #define ncneghash 3 #define numneglists (ncneghash + 1) struct neglist { struct mtx nl_evict_lock; struct mtx nl_lock __aligned(CACHE_LINE_SIZE); TAILQ_HEAD(, namecache) nl_list; TAILQ_HEAD(, namecache) nl_hotlist; u_long nl_hotnum; } __aligned(CACHE_LINE_SIZE); static struct neglist neglists[numneglists]; static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct mtx_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_time; *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ -SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, - sizeof(struct namecache), "sizeof(struct namecache)"); +SYSCTL_SIZEOF_STRUCT(namecache); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); #define STATNODE_COUNTER(name, varname, descr) \ static COUNTER_U64_DEFINE_EARLY(varname); \ SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \ descr); STATNODE_ULONG(neg, numneg, "Number of negative cache entries"); STATNODE_ULONG(count, numcache, "Number of cache entries"); STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(miss, nummiss, "Number of cache misses"); STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(poszaps, numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(negzaps, numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. */ STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(fullpathfail2, numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls"); STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache"); /* * Debug or developer statistics. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache debugging"); #define DEBUGNODE_ULONG(name, varname, descr) \ SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr); static u_long zap_bucket_relock_success; DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success, "Number of successful removals after relocking"); static u_long zap_bucket_fail; DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, ""); static u_long zap_bucket_fail2; DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, ""); static u_long cache_lock_vnodes_cel_3_failures; DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend); static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * Directory vnodes with entries are held for two reasons: * 1. make them less of a target for reclamation in vnlru * 2. suffer smaller performance penalty in locked lookup as requeieing is avoided * * It will be feasible to stop doing it altogether if all filesystems start * supporting lockless lookup. */ static void cache_hold_vnode(struct vnode *vp) { cache_assert_vnode_locked(vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); vhold(vp); counter_u64_add(numcachehv, 1); } static void cache_drop_vnode(struct vnode *vp) { /* * Called after all locks are dropped, meaning we can't assert * on the state of v_cache_src. */ vdrop(vp); counter_u64_add(numcachehv, -1); } /* * UMA zones. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; char * cache_symlink_alloc(size_t size, int flags) { if (size < CACHE_ZONE_SMALL_SIZE) { return (uma_zalloc_smr(cache_zone_small, flags)); } if (size < CACHE_ZONE_LARGE_SIZE) { return (uma_zalloc_smr(cache_zone_large, flags)); } counter_u64_add(symlinktoobig, 1); SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size); return (NULL); } void cache_symlink_free(char *string, size_t size) { MPASS(string != NULL); KASSERT(size < CACHE_ZONE_LARGE_SIZE, ("%s: size %zu too big", __func__, size)); if (size < CACHE_ZONE_SMALL_SIZE) { uma_zfree_smr(cache_zone_small, string); return; } if (size < CACHE_ZONE_LARGE_SIZE) { uma_zfree_smr(cache_zone_large, string); return; } __assert_unreachable(); } static struct namecache * cache_alloc_uma(int len, bool ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free_uma(struct namecache *ncp) { struct namecache_ts *ncp_ts; if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static struct namecache * cache_alloc(int len, bool ts) { u_long lnumcache; /* * Avoid blowout in namecache entries. * * Bugs: * 1. filesystems may end up trying to add an already existing entry * (for example this can happen after a cache miss during concurrent * lookup), in which case we will call cache_neg_evict despite not * adding anything. * 2. the routine may fail to free anything and no provisions are made * to make it try harder (see the inside for failure modes) * 3. it only ever looks at negative entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (cache_neg_evict_cond(lnumcache)) { lnumcache = atomic_load_long(&numcache); } if (__predict_false(lnumcache >= ncsize)) { atomic_subtract_long(&numcache, 1); counter_u64_add(numdrops, 1); return (NULL); } return (cache_alloc_uma(len, ts)); } static void cache_free(struct namecache *ncp) { MPASS(ncp != NULL); if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); atomic_subtract_long(&numcache, 1); } static void cache_free_batch(struct cache_freebatch *batch) { struct namecache *ncp, *nnp; int i; i = 0; if (TAILQ_EMPTY(batch)) goto out; TAILQ_FOREACH_SAFE(ncp, batch, nc_dst, nnp) { if ((ncp->nc_flag & NCF_DVDROP) != 0) { cache_drop_vnode(ncp->nc_dvp); } cache_free_uma(ncp); i++; } atomic_subtract_long(&numcache, i); out: SDT_PROBE1(vfs, namecache, purge, batch, i); } /* * Hashing. * * The code was made to use FNV in 2001 and this choice needs to be revisited. * * Short summary of the difficulty: * The longest name which can be inserted is NAME_MAX characters in length (or * 255 at the time of writing this comment), while majority of names used in * practice are significantly shorter (mostly below 10). More importantly * majority of lookups performed find names are even shorter than that. * * This poses a problem where hashes which do better than FNV past word size * (or so) tend to come with additional overhead when finalizing the result, * making them noticeably slower for the most commonly used range. * * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c * * When looking it up the most time consuming part by a large margin (at least * on amd64) is hashing. Replacing FNV with something which pessimizes short * input would make the slowest part stand out even more. */ /* * TODO: With the value stored we can do better than computing the hash based * on the address. */ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static uint32_t cache_get_hash_iter_start(struct vnode *dvp) { return (dvp->v_nchash); } static uint32_t cache_get_hash_iter(char c, uint32_t hash) { return (fnv_32_buf(&c, 1, hash)); } static uint32_t cache_get_hash_iter_finish(uint32_t hash) { return (hash); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct mtx * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_OWNED); } static void cache_assert_bucket_unlocked(struct namecache *ncp) { struct mtx *blp; blp = NCP2BUCKETLOCK(ncp); mtx_assert(blp, MA_NOTOWNED); } #else #define cache_assert_bucket_locked(x) do { } while (0) #define cache_assert_bucket_unlocked(x) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_lock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) mtx_unlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); static int sysctl_hitpct(SYSCTL_HANDLER_ARGS) { long poshits, neghits, miss, total; long pct; poshits = counter_u64_fetch(numposhits); neghits = counter_u64_fetch(numneghits); miss = counter_u64_fetch(nummiss); total = poshits + neghits + miss; pct = 0; if (total != 0) pct = ((poshits + neghits) * 100) / total; return (sysctl_handle_int(oidp, 0, pct, req)); } SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct, "I", "Percentage of hits"); static void cache_recalc_neg_min(void) { neg_min = (ncsize * ncnegminpct) / 100; } static int sysctl_negminpct(SYSCTL_HANDLER_ARGS) { u_int val; int error; val = ncnegminpct; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == ncnegminpct) return (0); if (val < 0 || val > 99) return (EINVAL); ncnegminpct = val; cache_recalc_neg_min(); return (0); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct, "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed"); #ifdef DEBUG_CACHE /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * Various workloads create plenty of negative entries and barely use them * afterwards. Moreover malicious users can keep performing bogus lookups * adding even more entries. For example "make tinderbox" as of writing this * comment ends up with 2.6M namecache entries in total, 1.2M of which are * negative. * * As such, a rather aggressive eviction method is needed. The currently * employed method is a placeholder. * * Entries are split over numneglists separate lists, each of which is further * split into hot and cold entries. Entries get promoted after getting a hit. * Eviction happens on addition of new entry. */ static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache negative entry statistics"); SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0, "Number of negative cache entries"); static COUNTER_U64_DEFINE_EARLY(neg_created); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created, "Number of created negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evicted); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted, "Number of evicted negative entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD, &neg_evict_skipped_empty, "Number of times evicting failed due to lack of entries"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD, &neg_evict_skipped_missed, "Number of times evicting failed due to target entry disappearing"); static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD, &neg_evict_skipped_contended, "Number of times evicting failed due to contention"); SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits, "Number of cache hits (negative)"); static int sysctl_neg_hot(SYSCTL_HANDLER_ARGS) { int i, out; out = 0; for (i = 0; i < numneglists; i++) out += neglists[i].nl_hotnum; return (SYSCTL_OUT(req, &out, sizeof(out))); } SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I", "Number of hot negative entries"); static void cache_neg_init(struct namecache *ncp) { struct negstate *ns; ncp->nc_flag |= NCF_NEGATIVE; ns = NCP2NEGSTATE(ncp); ns->neg_flag = 0; ns->neg_hit = 0; counter_u64_add(neg_created, 1); } #define CACHE_NEG_PROMOTION_THRESH 2 static bool cache_neg_hit_prep(struct namecache *ncp) { struct negstate *ns; u_char n; ns = NCP2NEGSTATE(ncp); n = atomic_load_char(&ns->neg_hit); for (;;) { if (n >= CACHE_NEG_PROMOTION_THRESH) return (false); if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1)) break; } return (n + 1 == CACHE_NEG_PROMOTION_THRESH); } /* * Nothing to do here but it is provided for completeness as some * cache_neg_hit_prep callers may end up returning without even * trying to promote. */ #define cache_neg_hit_abort(ncp) do { } while (0) static void cache_neg_hit_finish(struct namecache *ncp) { SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name); counter_u64_add(numneghits, 1); } /* * Move a negative entry to the hot list. */ static void cache_neg_promote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); if ((ns->neg_flag & NEG_HOT) == 0) { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum++; ns->neg_flag |= NEG_HOT; } } /* * Move a hot negative entry to the cold list. */ static void cache_neg_demote_locked(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; ns = NCP2NEGSTATE(ncp); nl = NCP2NEGLIST(ncp); mtx_assert(&nl->nl_lock, MA_OWNED); MPASS(ns->neg_flag & NEG_HOT); TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); nl->nl_hotnum--; ns->neg_flag &= ~NEG_HOT; atomic_store_char(&ns->neg_hit, 0); } /* * Move a negative entry to the hot list if it matches the lookup. * * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Exiting the section means * the found entry could have been evicted. We are going to look it * up again. */ static bool cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp, struct namecache *oncp, uint32_t hash) { struct namecache *ncp; struct neglist *nl; u_char nc_flag; nl = NCP2NEGLIST(oncp); mtx_lock(&nl->nl_lock); /* * For hash iteration. */ vfs_smr_enter(); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * XXX There are no provisions to keep the vnode around, meaning we may * end up promoting a negative entry for a *new* vnode and returning * ENOENT on its account. This is the error we want to return anyway * and promotion is harmless. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (!cache_ncp_canuse(ncp)) { goto out_abort; } cache_neg_promote_locked(ncp); cache_neg_hit_finish(ncp); vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (true); out_abort: vfs_smr_exit(); mtx_unlock(&nl->nl_lock); return (false); } static void cache_neg_promote(struct namecache *ncp) { struct neglist *nl; nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); cache_neg_promote_locked(ncp); mtx_unlock(&nl->nl_lock); } static void cache_neg_insert(struct namecache *ncp) { struct neglist *nl; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst); mtx_unlock(&nl->nl_lock); atomic_add_long(&numneg, 1); } static void cache_neg_remove(struct namecache *ncp) { struct neglist *nl; struct negstate *ns; cache_assert_bucket_locked(ncp); nl = NCP2NEGLIST(ncp); ns = NCP2NEGSTATE(ncp); mtx_lock(&nl->nl_lock); if ((ns->neg_flag & NEG_HOT) != 0) { TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst); nl->nl_hotnum--; } else { TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst); } mtx_unlock(&nl->nl_lock); atomic_subtract_long(&numneg, 1); } static struct neglist * cache_neg_evict_select_list(void) { struct neglist *nl; u_int c; c = atomic_fetchadd_int(&neg_cycle, 1) + 1; nl = &neglists[c % numneglists]; if (!mtx_trylock(&nl->nl_evict_lock)) { counter_u64_add(neg_evict_skipped_contended, 1); return (NULL); } return (nl); } static struct namecache * cache_neg_evict_select_entry(struct neglist *nl) { struct namecache *ncp, *lncp; struct negstate *ns, *lns; int i; mtx_assert(&nl->nl_evict_lock, MA_OWNED); mtx_assert(&nl->nl_lock, MA_OWNED); ncp = TAILQ_FIRST(&nl->nl_list); if (ncp == NULL) return (NULL); lncp = ncp; lns = NCP2NEGSTATE(lncp); for (i = 1; i < 4; i++) { ncp = TAILQ_NEXT(ncp, nc_dst); if (ncp == NULL) break; ns = NCP2NEGSTATE(ncp); if (ns->neg_hit < lns->neg_hit) { lncp = ncp; lns = ns; } } return (lncp); } static bool cache_neg_evict(void) { struct namecache *ncp, *ncp2; struct neglist *nl; struct vnode *dvp; struct mtx *dvlp; struct mtx *blp; uint32_t hash; u_char nlen; bool evicted; nl = cache_neg_evict_select_list(); if (nl == NULL) { return (false); } mtx_lock(&nl->nl_lock); ncp = TAILQ_FIRST(&nl->nl_hotlist); if (ncp != NULL) { cache_neg_demote_locked(ncp); } ncp = cache_neg_evict_select_entry(nl); if (ncp == NULL) { counter_u64_add(neg_evict_skipped_empty, 1); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); return (false); } nlen = ncp->nc_nlen; dvp = ncp->nc_dvp; hash = cache_get_hash(ncp->nc_name, nlen, dvp); dvlp = VP2VNODELOCK(dvp); blp = HASH2BUCKETLOCK(hash); mtx_unlock(&nl->nl_lock); mtx_unlock(&nl->nl_evict_lock); mtx_lock(dvlp); mtx_lock(blp); /* * Note that since all locks were dropped above, the entry may be * gone or reallocated to be something else. */ CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) { if (ncp2 == ncp && ncp2->nc_dvp == dvp && ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0) break; } if (ncp2 == NULL) { counter_u64_add(neg_evict_skipped_missed, 1); ncp = NULL; evicted = false; } else { MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp)); MPASS(blp == NCP2BUCKETLOCK(ncp)); SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(neg_evicted, 1); evicted = true; } mtx_unlock(blp); mtx_unlock(dvlp); if (ncp != NULL) cache_free(ncp); return (evicted); } /* * Maybe evict a negative entry to create more room. * * The ncnegfactor parameter limits what fraction of the total count * can comprise of negative entries. However, if the cache is just * warming up this leads to excessive evictions. As such, ncnegminpct * (recomputed to neg_min) dictates whether the above should be * applied. * * Try evicting if the cache is close to full capacity regardless of * other considerations. */ static bool cache_neg_evict_cond(u_long lnumcache) { u_long lnumneg; if (ncsize - 1000 < lnumcache) goto out_evict; lnumneg = atomic_load_long(&numneg); if (lnumneg < neg_min) return (false); if (lnumneg * ncnegfactor < lnumcache) return (false); out_evict: return (cache_neg_evict()); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; struct vnode *dvp, *vp; dvp = ncp->nc_dvp; vp = ncp->nc_vp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(vp); cache_assert_vnode_locked(dvp); cache_assert_bucket_locked(ncp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp); TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst); if (ncp == vp->v_cache_dd) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name); cache_neg_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == dvp->v_cache_dd) { atomic_store_ptr(&dvp->v_cache_dd, NULL); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; } } } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct mtx *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct mtx *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } mtx_lock(blp); cache_zap_locked(ncp); mtx_unlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. */ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct mtx *blp) { struct namecache *rncp; struct mtx *rvlp; cache_assert_bucket_unlocked(ncp); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); mtx_lock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) break; } if (rncp == NULL) goto out_mismatch; if (!(ncp->nc_flag & NCF_NEGATIVE)) rvlp = VP2VNODELOCK(rncp->nc_vp); else rvlp = NULL; if (rvlp != vlp) goto out_mismatch; cache_zap_locked(rncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); atomic_add_long(&zap_bucket_relock_success, 1); return (0); out_mismatch: mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct mtx *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); mtx_unlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; mtx_unlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static __noinline int cache_remove_cnp(struct vnode *dvp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { atomic_store_ptr(&dvp->v_cache_dd, NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); return (1); } /* * XXX note that access here is completely unlocked with no provisions * to keep the hash allocated. If one is sufficiently unlucky a * parallel cache resize can reallocate the hash, unmap backing pages * and cause the empty check below to fault. * * Fixing this has epsilon priority, but can be done with no overhead * for this codepath with sufficient effort. */ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (ncp == NULL) { mtx_unlock(blp); goto out_no_entry; } error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { atomic_add_long(&zap_bucket_fail, 1); goto retry; } counter_u64_add(numposzaps, 1); SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp); cache_free(ncp); return (1); out_no_entry: counter_u64_add(nummisszap, 1); SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp); return (0); } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static int __noinline cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct mtx *dvlp; enum vgetstate vs; int error, ltype; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) != 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { cache_remove_cnp(dvp, cnp); return (0); } retry: dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE2(vfs, namecache, lookup, miss, dvp, ".."); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; if (*vpp == NULL) goto negative_success; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } MPASS(dvp != *vpp); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); vs = vget_prep(*vpp); mtx_unlock(dvlp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: if (__predict_false(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); cache_zap_negative_locked_vnode_kl(ncp, dvp); mtx_unlock(dvlp); cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(dvlp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * Either both tsp and ticks have to be provided or neither of them. * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the * lock is not recursively acquired. */ static int __noinline cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; enum vgetstate vs; int error; bool whiteout; MPASS((cnp->cn_flags & ISDOTDOT) == 0); MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0); retry: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { mtx_unlock(blp); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); vs = vget_prep(*vpp); mtx_unlock(blp); error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto retry; } return (-1); negative_success: /* * We don't get here with regular lookup apart from corner cases. */ if (__predict_true(cnp->cn_nameiop == CREATE)) { if (cnp->cn_flags & ISLASTCN) { counter_u64_add(numnegzaps, 1); error = cache_zap_locked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { atomic_add_long(&zap_bucket_fail2, 1); goto retry; } cache_free(ncp); return (0); } } whiteout = (ncp->nc_flag & NCF_WHITE); cache_out_ts(ncp, tsp, ticksp); if (cache_neg_hit_prep(ncp)) cache_neg_promote(ncp); else cache_neg_hit_finish(ncp); mtx_unlock(blp); if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); } int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; uint32_t hash; enum vgetstate vs; int error; bool whiteout, neg_promote; u_short nc_flag; MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL)); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp)); } MPASS((cnp->cn_flags & ISDOTDOT) == 0); if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) { cache_remove_cnp(dvp, cnp); return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); vfs_smr_enter(); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { vfs_smr_exit(); SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr); counter_u64_add(nummiss, 1); return (0); } nc_flag = atomic_load_char(&ncp->nc_flag); if (nc_flag & NCF_NEGATIVE) goto negative_success; counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); MPASS(dvp != *vpp); if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto out_fallback; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto out_fallback; } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (error) { *vpp = NULL; goto out_fallback; } return (-1); negative_success: if (cnp->cn_nameiop == CREATE) { if (cnp->cn_flags & ISLASTCN) { vfs_smr_exit(); goto out_fallback; } } cache_out_ts(ncp, tsp, ticksp); whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE); neg_promote = cache_neg_hit_prep(ncp); if (!cache_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); vfs_smr_exit(); goto out_fallback; } if (neg_promote) { vfs_smr_exit(); if (!cache_neg_promote_cond(dvp, cnp, ncp, hash)) goto out_fallback; } else { cache_neg_hit_finish(ncp); vfs_smr_exit(); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); out_fallback: return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp)); } struct celockstate { struct mtx *vlp[3]; struct mtx *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_unlock_vnodes_cel(cel); atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1, struct mtx *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { mtx_lock(blp1); cel->blp[0] = blp1; } mtx_lock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) mtx_unlock(cel->blp[0]); mtx_unlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct mtx *blps[2]; u_char nc_flag; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) break; nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if ((nc_flag & NCF_NEGATIVE) != 0) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (atomic_load_ptr(&dvp->v_cache_dd) == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } atomic_store_ptr(&dvp->v_cache_dd, NULL); cache_enter_unlock(&cel); if (ncp != NULL) cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts; struct nchashhead *ncpp; uint32_t hash; int flag; int len; KASSERT(cnp->cn_namelen <= NAME_MAX, ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen, NAME_MAX)); VNPASS(!VN_IS_DOOMED(dvp), dvp); VNPASS(dvp->v_type != VNON, dvp); if (vp != NULL) { VNPASS(!VN_IS_DOOMED(vp), vp); VNPASS(vp->v_type != VNON, vp); } if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { KASSERT(dvp == vp, ("%s: different vnodes for dot entry (%p; %p)\n", __func__, dvp, vp)); } else { KASSERT(dvp != vp, ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__, cnp->cn_nameptr, dvp)); } #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); if (ncp == NULL) return; cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_neg_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); CK_SLIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { MPASS(cache_ncp_canuse(n2)); if ((n2->nc_flag & NCF_NEGATIVE) != 0) KASSERT(vp == NULL, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, NULL, vp, cnp->cn_nameptr)); else KASSERT(n2->nc_vp == vp, ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]", __func__, n2->nc_vp, vp, cnp->cn_nameptr)); /* * Entries are supposed to be immutable unless in the * process of getting destroyed. Accommodating for * changing timestamps is possible but not worth it. * This should be harmless in terms of correctness, in * the worst case resulting in an earlier expiration. * Alternatively, the found entry can be replaced * altogether. */ MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS))); #if 0 if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } #endif SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name, vp); goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); atomic_thread_fence_rel(); atomic_store_ptr(&dvp->v_cache_dd, ncp); } if (vp != NULL) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } atomic_thread_fence_rel(); atomic_store_ptr(&vp->v_cache_dd, ncp); } else if (vp->v_type != VDIR) { if (vp->v_cache_dd != NULL) { atomic_store_ptr(&vp->v_cache_dd, NULL); } } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { cache_hold_vnode(dvp); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE); cache_neg_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (ndd != NULL) cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); cache_free(ncp); return; } /* * A variant of the above accepting flags. * * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it. * * TODO: this routine is a hack. It blindly removes the old entry, even if it * happens to match and it is doing it in an inefficient manner. It was added * to accommodate NFS which runs into a case where the target for a given name * may change from under it. Note this does nothing to solve the following * race: 2 callers of cache_enter_time_flags pass a different target vnode for * the same [dvp, cnp]. It may be argued that code doing this is broken. */ void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags) { MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0); if (flags & VFS_CACHE_DROPOLD) cache_remove_cnp(dvp, cnp); cache_enter_time(dvp, vp, cnp, tsp, dtsp); } static u_long cache_roundup_2(u_long val) { u_long res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc(hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; cache_recalc_neg_min(); nchashtbl = nchinittbl(ncsize, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); for (i = 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF); mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); TAILQ_INIT(&neglists[i].nl_hotlist); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } /* * Induce transient cache misses for lockless operation in cache_lookup() by * using a temporary hash table. * * This will force a fs lookup. * * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time * to observe all CPUs not performing the lookup. */ static void cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash) { MPASS(temphash < nchash); /* * Change the size. The new size is smaller and can safely be used * against the existing table. All lookups which now hash wrong will * result in a cache miss, which all callers are supposed to know how * to handle. */ atomic_store_long(&nchash, temphash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated hash value, but they still * see the old table. */ atomic_store_ptr(&nchashtbl, temptbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } /* * Set the new hash table. * * Similarly to cache_changesize_set_temp(), this has to synchronize against * lockless operation in cache_lookup(). */ static void cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash) { MPASS(nchash < new_hash); /* * Change the pointer first. This wont result in out of bounds access * since the temporary table is guaranteed to be smaller. */ atomic_store_ptr(&nchashtbl, new_tbl); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated pointer value, but they * still see the old size. */ atomic_store_long(&nchash, new_hash); atomic_thread_fence_rel(); vfs_smr_synchronize(); /* * At this point everyone sees the updated table pointer and size pair. */ } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl; u_long new_nchash, old_nchash, temphash; struct namecache *ncp; uint32_t hash; u_long newncsize; u_long i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } temptbl = nchinittbl(1, &temphash); /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; cache_changesize_set_temp(temptbl, temphash); for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash); } } ncsize = newncsize; cache_recalc_neg_min(); cache_changesize_set_new(new_nchashtbl, new_nchash); cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); ncfreetbl(temptbl); } /* * Remove all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp; struct mtx *vlp, *vlp2; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_lock(vlp); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); cache_free_batch(&batch); } /* * Opportunistic check to see if there is anything to do. */ static bool cache_has_entries(struct vnode *vp) { if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && atomic_load_ptr(&vp->v_cache_dd) == NULL) return (false); return (true); } void cache_purge(struct vnode *vp) { SDT_PROBE1(vfs, namecache, purge, done, vp); if (!cache_has_entries(vp)) return; cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } /* * Serialize against a potential thread doing cache_purge. */ vlp = VP2VNODELOCK(vp); mtx_wait_unlocked(vlp); if (cache_has_entries(vp)) { cache_purge_impl(vp); return; } return; } /* * Remove all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { struct cache_freebatch batch; struct namecache *ncp, *nnp; struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&batch); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&batch, ncp, nc_dst); } mtx_unlock(vlp); cache_free_batch(&batch); } /* * Entry points for modifying VOP operations. */ void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { ASSERT_VOP_IN_SEQC(fdvp); ASSERT_VOP_IN_SEQC(fvp); ASSERT_VOP_IN_SEQC(tdvp); if (tvp != NULL) ASSERT_VOP_IN_SEQC(tvp); cache_purge(fvp); if (tvp != NULL) { cache_purge(tvp); KASSERT(!cache_remove_cnp(tdvp, tcnp), ("%s: lingering negative entry", __func__)); } else { cache_remove_cnp(tdvp, tcnp); } /* * TODO * * Historically renaming was always purging all revelang entries, * but that's quite wasteful. In particular turns out that in many cases * the target file is immediately accessed after rename, inducing a cache * miss. * * Recode this to reduce relocking and reuse the existing entry (if any) * instead of just removing it above and allocating a new one here. */ cache_enter(tdvp, fvp, tcnp); } void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { ASSERT_VOP_IN_SEQC(dvp); ASSERT_VOP_IN_SEQC(vp); cache_purge(vp); } #ifdef INVARIANTS /* * Validate that if an entry exists it matches. */ void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct namecache *ncp; struct mtx *blp; uint32_t hash; hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); if (CK_SLIST_EMPTY(NCHHASH(hash))) return; blp = HASH2BUCKETLOCK(hash); mtx_lock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) { if (ncp->nc_vp != vp) panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n", __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp); } } mtx_unlock(blp); } void cache_assert_no_entries(struct vnode *vp) { VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp); VNPASS(LIST_EMPTY(&vp->v_cache_src), vp); VNPASS(vp->v_cache_dd == NULL, vp); } #endif /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp) { struct vnode *vp, *mvp; size_t visited __sdt_used, purged __sdt_used; visited = purged = 0; /* * Somewhat wasteful iteration over all vnodes. Would be better to * support filtering and avoid the interlock to begin with. */ MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { visited++; if (!cache_has_entries(vp)) { VI_UNLOCK(vp); continue; } vholdl(vp); VI_UNLOCK(vp); cache_purge(vp); purged++; vdrop(vp); } SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } /* * Canonicalize a path by walking it forward and back. * * BUGS: * - Nothing guarantees the integrity of the entire chain. Consider the case * where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of * "foo" into "quux" during the backwards walk. The result will be * "quux/bar/baz/qux", which could not have been obtained by an incremental * walk in userspace. Moreover, the path we return is inaccessible if the * calling thread lacks permission to traverse "quux". */ static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR && (nd.ni_vp->v_vflag & VV_ROOT) != 0) { struct vnode *covered_vp; /* * This happens if vp is a file mount. The call to * vn_fullpath_hardlink can panic if path resolution can't be * handled without the directory. * * To resolve this, we find the vnode which was mounted on - * this should have a unique global path since we disallow * mounting on linked files. */ error = vn_lock(nd.ni_vp, LK_SHARED); if (error != 0) goto out; covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered; vref(covered_vp); VOP_UNLOCK(nd.ni_vp); error = vn_fullpath(covered_vp, &retbuf, &freebuf); vrele(covered_vp); } else { error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf, &freebuf, &size); } if (error == 0) { size_t len; len = strlen(retbuf) + 1; if (size < len) error = ENAMETOOLONG; else if (pathseg == UIO_USERSPACE) error = copyout(retbuf, buf, len); else memcpy(buf, retbuf, len); free(freebuf, M_TEMP); } out: vrele(nd.ni_vp); vrele(nd.ni_dvp); NDFREE_PNBUF(&nd); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vp == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); vfs_smr_enter(); error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen); } if (error == 0) *freebuf = buf; else free(buf, M_TEMP); return (error); } static struct namecache * vn_dd_from_dst(struct vnode *vp) { struct namecache *ncp; cache_assert_vnode_locked(vp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) return (ncp); } return (NULL); } int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); ncp = (*vp)->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) { KASSERT(ncp == vn_dd_from_dst(*vp), ("%s: mismatch for dd entry (%p != %p)", __func__, ncp, vn_dd_from_dst(*vp))); } else { ncp = vn_dd_from_dst(*vp); } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; bool slash_prefixed; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; slash_prefixed = true; if (addend == 0) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; slash_prefixed = false; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. */ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); error = vn_vptocnp(&vp, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static void __inline cache_rev_failed_impl(int *reason, int line) { *reason = line; } #define cache_rev_failed(var) cache_rev_failed_impl((var), __LINE__) static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *tvp; struct mount *mp; struct namecache *ncp; size_t orig_buflen; int reason; int error; #ifdef KDTRACE_HOOKS int i; #endif seqc_t vp_seqc, tvp_seqc; u_char nc_flag; VFS_SMR_ASSERT_ENTERED(); if (!atomic_load_char(&cache_fast_lookup_enabled)) { vfs_smr_exit(); return (-1); } orig_buflen = *buflen; if (addend == 0) { MPASS(*buflen >= 2); *buflen -= 1; buf[*buflen] = '\0'; } if (vp == rdir || vp == rootvnode) { if (addend == 0) { *buflen -= 1; buf[*buflen] = '/'; } goto out_ok; } #ifdef KDTRACE_HOOKS i = 0; #endif error = -1; ncp = NULL; /* for sdt probe down below */ vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } for (;;) { #ifdef KDTRACE_HOOKS i++; #endif if ((vp->v_vflag & VV_ROOT) != 0) { mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) { cache_rev_failed(&reason); goto out_abort; } tvp = atomic_load_ptr(&mp->mnt_vnodecovered); tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; continue; } ncp = atomic_load_consume_ptr(&vp->v_cache_dd); if (ncp == NULL) { cache_rev_failed(&reason); goto out_abort; } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { cache_rev_failed(&reason); goto out_abort; } if (ncp->nc_nlen >= *buflen) { cache_rev_failed(&reason); error = ENOMEM; goto out_abort; } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); *buflen -= 1; buf[*buflen] = '/'; tvp = ncp->nc_dvp; tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(tvp_seqc)) { cache_rev_failed(&reason); goto out_abort; } if (!vn_seqc_consistent(vp, vp_seqc)) { cache_rev_failed(&reason); goto out_abort; } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) { cache_rev_failed(&reason); goto out_abort; } if (!cache_ncp_canuse(ncp)) { cache_rev_failed(&reason); goto out_abort; } vp = tvp; vp_seqc = tvp_seqc; if (vp == rdir || vp == rootvnode) break; } out_ok: vfs_smr_exit(); *retbuf = buf + *buflen; *buflen = orig_buflen - *buflen + addend; SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf); return (0); out_abort: *buflen = orig_buflen; SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i); vfs_smr_exit(); return (error); } static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen, addend; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); addend = 0; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; addend = orig_buflen - *buflen; } return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, addend)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * * Since the namecache does not track hardlinks, the caller is expected to * first look up the target vnode with WANTPARENT flag passed to namei to get * dvp and vp. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * following names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ int vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp, const char *hrdl_name, size_t hrdl_name_length, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; size_t addend; int error; __enum_uint8(vtype) type; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; buf = malloc(*buflen, M_TEMP, M_WAITOK); addend = 0; /* * Check for VBAD to work around the vp_crossmp bug in lookup(). * * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be * set to mount point's root vnode while ni_dvp will be vp_crossmp. * If the type is VDIR (like in this very case) we can skip looking * at ni_dvp in the first place. However, since vnodes get passed here * unlocked the target may transition to doomed state (type == VBAD) * before we get to evaluate the condition. If this happens, we will * populate part of the buffer and descend to vn_fullpath_dir with * vp == vp_crossmp. Prevent the problem by checking for VBAD. */ type = atomic_load_8(&vp->v_type); if (type == VBAD) { error = ENOENT; goto out_bad; } if (type != VDIR) { addend = hrdl_name_length + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length); tmpbuf[addend - 1] = '\0'; vp = dvp; } vfs_smr_enter(); pwd = pwd_get_smr(); error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); VFS_SMR_ASSERT_NOT_ENTERED(); if (error < 0) { pwd = pwd_hold(curthread); vref(vp); error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen, addend); pwd_drop(pwd); } if (error != 0) goto out_bad; *freebuf = buf; return (0); out_bad: free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } /* * This is similar to vn_path_to_global_path but allows for regular * files which may not be present in the cache. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. */ int vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp, struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name, size_t leaf_length) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; size_t len; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* * Construct global filesystem path from dvp, vp and leaf * name. */ VOP_UNLOCK(vp); len = pathlen; error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length, &rpath, &fbuf, &len); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE_PNBUF(&nd); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif static int cache_fast_lookup = 1; #define CACHE_FPL_FAILED -2020 static int cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v) { vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n"); panic("no proper vop_fplookup_vexec"); } static int cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v) { vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n"); panic("no proper vop_fplookup_symlink"); } void cache_vop_vector_register(struct vop_vector *v) { size_t ops; ops = 0; if (v->vop_fplookup_vexec != NULL) { ops++; } if (v->vop_fplookup_symlink != NULL) { ops++; } if (ops == 2) { return; } if (ops == 0) { v->vop_fplookup_vexec = cache_vop_bad_vexec; v->vop_fplookup_symlink = cache_vop_bad_symlink; return; } printf("%s: invalid vop vector %p -- either all or none fplookup vops " "need to be provided", __func__, v); if (v->vop_fplookup_vexec == NULL) { printf("%s: missing vop_fplookup_vexec\n", __func__); } if (v->vop_fplookup_symlink == NULL) { printf("%s: missing vop_fplookup_symlink\n", __func__); } panic("bad vop vector %p", v); } #ifdef INVARIANTS void cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops) { if (mp == NULL) return; if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return; if (vops->vop_fplookup_vexec == NULL || vops->vop_fplookup_vexec == cache_vop_bad_vexec) panic("bad vop_fplookup_vexec on vector %p for filesystem %s", vops, mp->mnt_vfc->vfc_name); if (vops->vop_fplookup_symlink == NULL || vops->vop_fplookup_symlink == cache_vop_bad_symlink) panic("bad vop_fplookup_symlink on vector %p for filesystem %s", vops, mp->mnt_vfc->vfc_name); } #endif void cache_fast_lookup_enabled_recalc(void) { int lookup_flag; int mac_on; #ifdef MAC mac_on = mac_vnode_check_lookup_enabled(); mac_on |= mac_vnode_check_readlink_enabled(); #else mac_on = 0; #endif lookup_flag = atomic_load_int(&cache_fast_lookup); if (lookup_flag && !mac_on) { atomic_store_char(&cache_fast_lookup_enabled, true); } else { atomic_store_char(&cache_fast_lookup_enabled, false); } } static int syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS) { int error, old; old = atomic_load_int(&cache_fast_lookup); error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0 && req->newptr && old != atomic_load_int(&cache_fast_lookup)) cache_fast_lookup_enabled_recalc(); return (error); } SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE, &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", ""); /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_outer { size_t ni_pathlen; int cn_flags; }; struct nameidata_saved { #ifdef INVARIANTS char *cn_nameptr; size_t ni_pathlen; #endif }; #ifdef INVARIANTS struct cache_fpl_debug { size_t ni_pathlen; }; #endif struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; char *nulchar; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; uint32_t hash; struct nameidata_saved snd; struct nameidata_outer snd_outer; int line; enum cache_fpl_status status:8; bool in_smr; bool fsearch; struct pwd **pwd; #ifdef INVARIANTS struct cache_fpl_debug debug; #endif }; static bool cache_fplookup_mp_supported(struct mount *mp); static bool cache_fplookup_is_mp(struct cache_fpl *fpl); static int cache_fplookup_cross_mount(struct cache_fpl *fpl); static int cache_fplookup_partial_setup(struct cache_fpl *fpl); static int cache_fplookup_skip_slashes(struct cache_fpl *fpl); static int cache_fplookup_trailingslash(struct cache_fpl *fpl); static void cache_fpl_pathlen_dec(struct cache_fpl *fpl); static void cache_fpl_pathlen_inc(struct cache_fpl *fpl); static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n); static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n); static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; } static struct vnode * cache_fpl_handle_root(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } return (ndp->ni_rootdir); } static void cache_fpl_checkpoint_outer(struct cache_fpl *fpl) { fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen; fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags; } static void cache_fpl_checkpoint(struct cache_fpl *fpl) { #ifdef INVARIANTS fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; fpl->snd.ni_pathlen = fpl->debug.ni_pathlen; #endif } static void cache_fpl_restore_partial(struct cache_fpl *fpl) { fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags; #ifdef INVARIANTS fpl->debug.ni_pathlen = fpl->snd.ni_pathlen; #endif } static void cache_fpl_restore_abort(struct cache_fpl *fpl) { cache_fpl_restore_partial(fpl); /* * It is 0 on entry by API contract. */ fpl->ndp->ni_resflags = 0; fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf; fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) static void cache_fpl_assert_status(struct cache_fpl *fpl) { switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_DESTROYED: case CACHE_FPL_STATUS_ABORTED: case CACHE_FPL_STATUS_PARTIAL: case CACHE_FPL_STATUS_HANDLED: break; } } #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #define cache_fpl_assert_status(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted_early(x) cache_fpl_aborted_early_impl((x), __LINE__) static int __noinline cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; if (fpl->in_smr) cache_fpl_smr_exit(fpl); cache_fpl_restore_abort(fpl); /* * Resolving symlinks overwrites data passed by the caller. * Let namei know. */ if (ndp->ni_loopcnt > 0) { fpl->status = CACHE_FPL_STATUS_DESTROYED; cache_fpl_cleanup_cnp(cnp); } return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int __noinline cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (cache_fplookup_partial_setup(fpl)); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (0); } #define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) static int cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); MPASS(error != 0); MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; fpl->dvp = NULL; fpl->tvp = NULL; return (error); } #define cache_fpl_handled_error(x, e) cache_fpl_handled_error_impl((x), (e), __LINE__) static bool cache_fpl_terminated(struct cache_fpl *fpl) { return (fpl->status != CACHE_FPL_STATUS_UNSET); } #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \ ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \ OPENWRITE | WANTIOCTLCAPS) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_istrailingslash(struct cache_fpl *fpl) { MPASS(fpl->nulchar > fpl->cnp->cn_pnbuf); return (*(fpl->nulchar - 1) == '/'); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = curthread; if (!atomic_load_char(&cache_fast_lookup_enabled)) { cache_fpl_aborted_early(fpl); return (false); } if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted_early(fpl); return (false); } if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) { cache_fpl_aborted_early(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted_early(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted_early(fpl); return (false); } return (true); } static int __noinline cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp) { struct nameidata *ndp; struct componentname *cnp; int error; bool fsearch; ndp = fpl->ndp; cnp = fpl->cnp; error = fgetvp_lookup_smr(ndp, vpp, &fsearch); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } fpl->fsearch = fsearch; if ((*vpp)->v_type != VDIR) { if (!((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } } return (0); } static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct vnode *dvp; cnp = fpl->cnp; dvp = fpl->dvp; cache_fpl_smr_exit(fpl); if (cache_neg_promote_cond(dvp, cnp, oncp, hash)) return (cache_fpl_handled_error(fpl, ENOENT)); else return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; pwd = *(fpl->pwd); dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; if (!pwd_hold_smr(pwd)) { return (cache_fpl_aborted(fpl)); } /* * Note that seqc is checked before the vnode is locked, so by * the time regular lookup gets to it it may have moved. * * Ultimately this does not affect correctness, any lookup errors * are userspace racing with itself. It is guaranteed that any * path which ultimately gets found could also have been found * by regular lookup going all the way in absence of concurrent * modifications. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore_partial(fpl); #ifdef INVARIANTS if (cnp->cn_nameptr != fpl->snd.cn_nameptr) { panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__, cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf); } #endif ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; /* * Skip potential extra slashes parsing did not take care of. * cache_fplookup_skip_slashes explains the mechanism. */ if (__predict_false(*(cnp->cn_nameptr) == '/')) { do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl)); } /* * They want to possibly modify the state of the namecache. */ static int __noinline cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct nameidata *ndp __diagused; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; struct mount *mp; seqc_t dvp_seqc; int error; bool docache; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS(*(cnp->cn_nameptr) != '/'); MPASS(cache_fpl_islastcn(ndp)); if ((cnp->cn_flags & LOCKPARENT) == 0) MPASS((cnp->cn_flags & WANTPARENT) != 0); MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME); MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) docache = false; /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { cache_fpl_smr_exit(fpl); /* * Original code keeps not checking for CREATE which * might be a bug. For now let the old lookup decide. */ if (cnp->cn_nameiop == CREATE) { return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled_error(fpl, EROFS)); } if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EEXIST)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. * * XXX At least UFS requires its lookup routine to be called for * the last path component, which leads to some level of complication * and inefficiency: * - the target routine always locks the target vnode, but our caller * may not need it locked * - some of the VOP machinery asserts that the parent is locked, which * once more may be not required * * TODO: add a flag for filesystems which don't need this. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_EXCLUSIVE); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; cnp->cn_flags |= ISLASTCN; if (docache) cnp->cn_flags |= MAKEENTRY; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; cnp->cn_lkflags = LK_EXCLUSIVE; error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } /* * There are very hairy corner cases concerning various flag combinations * and locking state. In particular here we only hold one lock instead of * two. * * Skip the complexity as it is of no significance for normal workloads. */ if (__predict_false(tvp == dvp)) { vput(dvp); vrele(tvp); return (cache_fpl_aborted(fpl)); } /* * If they want the symlink itself we are fine, but if they want to * follow it regular lookup has to be engaged. */ if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } /* * Since we expect this to be the terminal vnode it should almost never * be a mount point. */ if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & FAILIFEXISTS) != 0) { vput(dvp); vput(tvp); return (cache_fpl_handled_error(fpl, EEXIST)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_modifying(struct cache_fpl *fpl) { struct nameidata *ndp; ndp = fpl->ndp; if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } return (cache_fplookup_final_modifying(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. */ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED || fpl->status == CACHE_FPL_STATUS_DESTROYED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; MPASS(*(cnp->cn_nameptr) != '/'); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } /* * Comment from locked lookup: * Check for degenerate name (e.g. / or "") which is a way of talking about a * directory, e.g. like "/." or ".". */ static int __noinline cache_fplookup_degenerate(struct cache_fpl *fpl) { struct componentname *cnp; struct vnode *dvp; enum vgetstate dvs; int error, lkflags; #ifdef INVARIANTS char *cp; #endif fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; #ifdef INVARIANTS for (cp = cnp->cn_pnbuf; *cp != '\0'; cp++) { KASSERT(*cp == '/', ("%s: encountered non-slash; string [%s]\n", __func__, cnp->cn_pnbuf)); } #endif if (__predict_false(cnp->cn_nameiop != LOOKUP)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EISDIR)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) { return (cache_fplookup_final_withparent(fpl)); } dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(dvp, lkflags, dvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_emptypath(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate tvs; struct vnode *tvp; int error, lkflags; fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; MPASS(*cnp->cn_pnbuf == '\0'); if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0); tvs = vget_prep_smr(tvp); cache_fpl_smr_exit(fpl); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } ndp->ni_resflags |= NIRES_EMPTYPATH; return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_noentry(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; MPASS((cnp->cn_flags & MAKEENTRY) == 0); MPASS((cnp->cn_flags & ISDOTDOT) == 0); if (cnp->cn_nameiop == LOOKUP) MPASS((cnp->cn_flags & NOCACHE) == 0); MPASS(!cache_fpl_isdotdot(cnp)); /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } if (cnp->cn_nameptr[0] == '/') { return (cache_fplookup_skip_slashes(fpl)); } if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } if (cnp->cn_nameptr[0] == '\0') { if (fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } return (cache_fplookup_trailingslash(fpl)); } if (cnp->cn_nameiop != LOOKUP) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } /* * Only try to fill in the component if it is the last one, * otherwise not only there may be several to handle but the * walk may be complicated. */ if (!cache_fpl_islastcn(ndp)) { return (cache_fpl_partial(fpl)); } /* * Regular lookup nulifies the slash, which we don't do here. * Don't take chances with filesystem routines seeing it for * the last entry. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_partial(fpl)); } /* * Secure access to dvp; check cache_fplookup_partial_setup for * reasoning. */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } error = vn_lock(dvp, LK_SHARED); if (__predict_false(error != 0)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } tvp = NULL; /* * TODO: provide variants which don't require locking either vnode. */ cnp->cn_flags |= ISLASTCN | MAKEENTRY; cnp->cn_lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) { cnp->cn_lkflags = LK_EXCLUSIVE; } error = VOP_LOOKUP(dvp, &tvp, cnp); switch (error) { case EJUSTRETURN: case 0: break; case ENOTDIR: case ENOENT: vput(dvp); return (cache_fpl_handled_error(fpl, error)); default: vput(dvp); return (cache_fpl_aborted(fpl)); } fpl->tvp = tvp; if (tvp == NULL) { MPASS(error == EJUSTRETURN); if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } if (tvp->v_type == VLNK) { if ((cnp->cn_flags & FOLLOW) != 0) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } } if (__predict_false(cache_fplookup_is_mp(fpl))) { vput(dvp); vput(tvp); return (cache_fpl_aborted(fpl)); } if ((cnp->cn_flags & LOCKLEAF) == 0) { VOP_UNLOCK(tvp); } if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { vput(dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0) { VOP_UNLOCK(dvp); } return (cache_fpl_handled(fpl)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { int error; MPASS(!seqc_in_modify(fpl->dvp_seqc)); if (__predict_false(fpl->dvp->v_type != VDIR)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Just re-assign the value. seqc will be checked later for the first * non-dot path component in line and/or before deciding to return the * vnode. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *dvp; struct prison *pr; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; MPASS(cache_fpl_isdotdot(cnp)); /* * XXX this is racy the same way regular lookup is */ for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dvp == pr->pr_root) break; if (dvp == ndp->ni_rootdir || dvp == ndp->ni_topdir || dvp == rootvnode || pr != NULL) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. */ return (cache_fpl_partial(fpl)); } if (__predict_false(dvp->v_type != VDIR)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } ncp = atomic_load_consume_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } /* * Acquire fence provided by vn_seqc_read_any above. */ if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) { return (cache_fpl_aborted(fpl)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_aborted(fpl)); } return (0); } static int __noinline cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash) { u_char nc_flag __diagused; bool neg_promote; #ifdef INVARIANTS nc_flag = atomic_load_char(&ncp->nc_flag); MPASS((nc_flag & NCF_NEGATIVE) != 0); #endif /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { fpl->tvp = NULL; return (cache_fplookup_modifying(fpl)); } neg_promote = cache_neg_hit_prep(ncp); if (!cache_fpl_neg_ncp_canuse(ncp)) { cache_neg_hit_abort(ncp); return (cache_fpl_partial(fpl)); } if (neg_promote) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } cache_neg_hit_finish(ncp); cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOENT)); } /* * Resolve a symlink. Called by filesystem-specific routines. * * Code flow is: * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve */ int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len) { struct nameidata *ndp; struct componentname *cnp; size_t adjust; ndp = fpl->ndp; cnp = fpl->cnp; if (__predict_false(len == 0)) { return (ENOENT); } if (__predict_false(len > MAXPATHLEN - 2)) { if (cache_fpl_istrailingslash(fpl)) { return (EAGAIN); } } ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1; #ifdef INVARIANTS if (ndp->ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) { return (ENAMETOOLONG); } if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) { return (ELOOP); } adjust = len; if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen); } else { if (cache_fpl_istrailingslash(fpl)) { adjust = len + 1; cnp->cn_pnbuf[len] = '/'; cnp->cn_pnbuf[len + 1] = '\0'; } else { cnp->cn_pnbuf[len] = '\0'; } } bcopy(string, cnp->cn_pnbuf, len); ndp->ni_pathlen += adjust; cache_fpl_pathlen_add(fpl, adjust); cnp->cn_nameptr = cnp->cn_pnbuf; fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl->tvp = NULL; return (0); } static int __noinline cache_fplookup_symlink(struct cache_fpl *fpl) { struct mount *mp; struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp, *tvp; struct pwd *pwd; int error; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; tvp = fpl->tvp; pwd = *(fpl->pwd); if (cache_fpl_islastcn(ndp)) { if ((cnp->cn_flags & FOLLOW) == 0) { return (cache_fplookup_final(fpl)); } } mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } /* * Note this check races against setting the flag just like regular * lookup. */ if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, EACCES)); } error = VOP_FPLOOKUP_SYMLINK(tvp, fpl); if (__predict_false(error != 0)) { switch (error) { case EAGAIN: return (cache_fpl_partial(fpl)); case ENOENT: case ENAMETOOLONG: case ELOOP: cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, error)); default: return (cache_fpl_aborted(fpl)); } } if (*(cnp->cn_nameptr) == '/') { fpl->dvp = cache_fpl_handle_root(fpl); fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } /* * The main loop assumes that ->dvp points to a vnode belonging * to a filesystem which can do lockless lookup, but the absolute * symlink can be wandering off to one which does not. */ mp = atomic_load_ptr(&fpl->dvp->v_mount); if (__predict_false(mp == NULL)) { return (cache_fpl_aborted(fpl)); } if (!cache_fplookup_mp_supported(mp)) { cache_fpl_checkpoint(fpl); return (cache_fpl_partial(fpl)); } if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) { return (cache_fpl_aborted(fpl)); } } return (0); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; int error; cnp = fpl->cnp; dvp = fpl->dvp; hash = fpl->hash; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) { return (cache_fplookup_dot(fpl)); } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { return (cache_fplookup_dotdot(fpl)); } } MPASS(!cache_fpl_isdotdot(cnp)); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } if (__predict_false(ncp == NULL)) { return (cache_fplookup_noentry(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { return (cache_fplookup_neg(fpl, ncp, hash)); } if (!cache_ncp_canuse(ncp)) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); error = 0; if (cache_fplookup_is_mp(fpl)) { error = cache_fplookup_cross_mount(fpl); } return (error); } static bool cache_fplookup_mp_supported(struct mount *mp) { MPASS(mp != NULL); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). * * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct mount_pcpu *mpcpu, *prev_mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp, mpcpu)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } prev_mp = mp; prev_mpcpu = mpcpu; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp, prev_mpcpu); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static int __noinline cache_fplookup_cross_mount(struct cache_fpl *fpl) { struct mount *mp; struct mount_pcpu *mpcpu; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp == NULL)) { return (0); } if (!vfs_op_thread_enter_crit(mp, mpcpu)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (__predict_false(vp == NULL)) { vfs_op_thread_exit_crit(mp, mpcpu); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); vfs_op_thread_exit_crit(mp, mpcpu); if (seqc_in_modify(vp_seqc)) { return (cache_fpl_partial(fpl)); } mp = atomic_load_ptr(&vp->v_mountedhere); if (__predict_false(mp != NULL)) { /* * There are possibly more mount points on top. * Normally this does not happen so for simplicity just start * over. */ return (cache_fplookup_climb_mount(fpl)); } fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } /* * Check if a vnode is mounted on. */ static bool cache_fplookup_is_mp(struct cache_fpl *fpl) { struct vnode *vp; vp = fpl->tvp; return ((vn_irflag_read(vp) & VIRF_MOUNTPOINT) != 0); } /* * Parse the path. * * The code was originally copy-pasted from regular lookup and despite * clean ups leaves performance on the table. Any modifications here * must take into account that in case off fallback the resulting * nameidata state has to be compatible with the original. */ /* * Debug ni_pathlen tracking. */ #ifdef INVARIANTS static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen += n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { fpl->debug.ni_pathlen -= n; KASSERT(fpl->debug.ni_pathlen <= PATH_MAX, ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen)); } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { cache_fpl_pathlen_add(fpl, 1); } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { cache_fpl_pathlen_sub(fpl, 1); } #else static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n) { } static void cache_fpl_pathlen_inc(struct cache_fpl *fpl) { } static void cache_fpl_pathlen_dec(struct cache_fpl *fpl) { } #endif static void cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct vnode *dvp; char *cp; uint32_t hash; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * Find the end of this path component, it is either / or nul. * * Store / as a temporary sentinel so that we only have one character * to test for. Pathnames tend to be short so this should not be * resulting in cache misses. * * TODO: fix this to be word-sized. */ MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf); KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar, ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n", __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1], fpl->nulchar, cnp->cn_pnbuf)); KASSERT(*fpl->nulchar == '\0', ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar, cnp->cn_pnbuf)); hash = cache_get_hash_iter_start(dvp); *fpl->nulchar = '/'; for (cp = cnp->cn_nameptr; *cp != '/'; cp++) { KASSERT(*cp != '\0', ("%s: encountered unexpected nul; string [%s]\n", __func__, cnp->cn_nameptr)); hash = cache_get_hash_iter(*cp, hash); continue; } *fpl->nulchar = '\0'; fpl->hash = cache_get_hash_iter_finish(hash); cnp->cn_namelen = cp - cnp->cn_nameptr; cache_fpl_pathlen_sub(fpl, cnp->cn_namelen); #ifdef INVARIANTS /* * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since * we are going to fail this lookup with ENAMETOOLONG (see below). */ if (cnp->cn_namelen <= NAME_MAX) { if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) { panic("%s: mismatched hash for [%s] len %ld", __func__, cnp->cn_nameptr, cnp->cn_namelen); } } #endif /* * Hack: we have to check if the found path component's length exceeds * NAME_MAX. However, the condition is very rarely true and check can * be elided in the common case -- if an entry was found in the cache, * then it could not have been too long to begin with. */ ndp->ni_next = cp; } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; KASSERT(*(cnp->cn_nameptr) == '/', ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__, cnp->cn_nameptr, cnp->cn_pnbuf, cnp->cn_pnbuf)); cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } /* * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry. * * Lockless lookup tries to elide checking for spurious slashes and should they * be present is guaranteed to fail to find an entry. In this case the caller * must check if the name starts with a slash and call this routine. It is * going to fast forward across the spurious slashes and set the state up for * retry. */ static int __noinline cache_fplookup_skip_slashes(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; MPASS(*(cnp->cn_nameptr) == '/'); do { cnp->cn_nameptr++; cache_fpl_pathlen_dec(fpl); } while (*(cnp->cn_nameptr) == '/'); /* * Go back to one slash so that cache_fplookup_parse_advance has * something to skip. */ cnp->cn_nameptr--; cache_fpl_pathlen_inc(fpl); /* * cache_fplookup_parse_advance starts from ndp->ni_next */ ndp->ni_next = cnp->cn_nameptr; /* * See cache_fplookup_dot. */ fpl->tvp = fpl->dvp; fpl->tvp_seqc = fpl->dvp_seqc; return (0); } /* * Handle trailing slashes (e.g., "foo/"). * * If a trailing slash is found the terminal vnode must be a directory. * Regular lookup shortens the path by nulifying the first trailing slash and * sets the TRAILINGSLASH flag to denote this took place. There are several * checks on it performed later. * * Similarly to spurious slashes, lockless lookup handles this in a speculative * manner relying on an invariant that a non-directory vnode will get a miss. * In this case cn_nameptr[0] == '\0' and cn_namelen == 0. * * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/" * and denotes this is the last path component, which avoids looping back. * * Only plain lookups are supported for now to restrict corner cases to handle. */ static int __noinline cache_fplookup_trailingslash(struct cache_fpl *fpl) { #ifdef INVARIANTS size_t ni_pathlen; #endif struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *tvp; char *cn_nameptr_orig, *cn_nameptr_slash; seqc_t tvp_seqc; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; MPASS(fpl->dvp == fpl->tvp); KASSERT(cache_fpl_istrailingslash(fpl), ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1, cnp->cn_pnbuf)); KASSERT(cnp->cn_nameptr[0] == '\0', ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0], cnp->cn_pnbuf)); KASSERT(cnp->cn_namelen == 0, ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen, cnp->cn_pnbuf)); MPASS(cnp->cn_nameptr > cnp->cn_pnbuf); if (cnp->cn_nameiop != LOOKUP) { return (cache_fpl_aborted(fpl)); } if (__predict_false(tvp->v_type != VDIR)) { if (!vn_seqc_consistent(tvp, tvp_seqc)) { return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENOTDIR)); } /* * Denote the last component. */ ndp->ni_next = &cnp->cn_nameptr[0]; MPASS(cache_fpl_islastcn(ndp)); /* * Unwind trailing slashes. */ cn_nameptr_orig = cnp->cn_nameptr; while (cnp->cn_nameptr >= cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] != '/') { break; } } /* * Unwind to the beginning of the path component. * * Note the path may or may not have started with a slash. */ cn_nameptr_slash = cnp->cn_nameptr; while (cnp->cn_nameptr > cnp->cn_pnbuf) { cnp->cn_nameptr--; if (cnp->cn_nameptr[0] == '/') { break; } } if (cnp->cn_nameptr[0] == '/') { cnp->cn_nameptr++; } cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1; cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr); cache_fpl_checkpoint(fpl); #ifdef INVARIANTS ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1; if (ni_pathlen != fpl->debug.ni_pathlen) { panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n", __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar, cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf); } #endif /* * If this was a "./" lookup the parent directory is already correct. */ if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) { return (0); } /* * Otherwise we need to look it up. */ tvp = fpl->tvp; ncp = atomic_load_consume_ptr(&tvp->v_cache_dd); if (__predict_false(ncp == NULL)) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { return (cache_fpl_aborted(fpl)); } fpl->dvp = ncp->nc_dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { struct componentname *cnp; struct vnode *dvp; seqc_t dvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; /* * Hack: delayed empty path checking. */ if (cnp->cn_pnbuf[0] == '\0') { return (cache_fplookup_emptypath(fpl)); } /* * TODO: Due to ignoring trailing slashes lookup will perform a * permission check on the last dir when it should not be doing it. It * may fail, but said failure should be ignored. It is possible to fix * it up fully without resorting to regular lookup, but for now just * abort. */ if (cache_fpl_istrailingslash(fpl)) { return (cache_fpl_aborted(fpl)); } /* * Hack: delayed degenerate path checking. */ if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) { return (cache_fplookup_degenerate(fpl)); } /* * Hack: delayed name len checking. */ if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled_error(fpl, ENAMETOOLONG)); } /* * Hack: they may be looking up foo/bar, where foo is not a directory. * In such a case we need to return ENOTDIR, but we may happen to get * here with a different error. */ if (dvp->v_type != VDIR) { error = ENOTDIR; } /* * Hack: handle O_SEARCH. * * Open Group Base Specifications Issue 7, 2018 edition states: * * If the access mode of the open file description associated with the * file descriptor is not O_SEARCH, the function shall check whether * directory searches are permitted using the current permissions of * the directory underlying the file descriptor. If the access mode is * O_SEARCH, the function shall not perform the check. * * * Regular lookup tests for the NOEXECCHECK flag for every path * component to decide whether to do the permission check. However, * since most lookups never have the flag (and when they do it is only * present for the first path component), lockless lookup only acts on * it if there is a permission problem. Here the flag is represented * with a boolean so that we don't have to clear it on the way out. * * For simplicity this always aborts. * TODO: check if this is the first lookup and ignore the permission * problem. Note the flag has to survive fallback (if it happens to be * performed). */ if (fpl->fsearch) { return (cache_fpl_aborted(fpl)); } switch (error) { case EAGAIN: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_partial(fpl); } break; default: if (!vn_seqc_consistent(dvp, dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled_error(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl); /* * The vnode at hand is almost always stable, skip checking for it. * Worst case this postpones the check towards the end of the iteration * of the main loop. */ fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp); mp = atomic_load_ptr(&dvp->v_mount); if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) { return (cache_fpl_aborted(fpl)); } MPASS(fpl->tvp == NULL); for (;;) { cache_fplookup_parse(fpl); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } error = cache_fplookup_next(fpl); if (__predict_false(cache_fpl_terminated(fpl))) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (fpl->tvp->v_type == VLNK) { error = cache_fplookup_symlink(fpl); if (cache_fpl_terminated(fpl)) { break; } } else { if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); } cache_fpl_checkpoint(fpl); } return (error); } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. * - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; int error; fpl.status = CACHE_FPL_STATUS_UNSET; fpl.in_smr = false; fpl.ndp = ndp; fpl.cnp = cnp = &ndp->ni_cnd; MPASS(ndp->ni_lcf == 0); KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, ("%s: internal flags found in cn_flags %" PRIx64, __func__, cnp->cn_flags)); MPASS(cnp->cn_nameptr == cnp->cn_pnbuf); MPASS(ndp->ni_resflags == 0); if (__predict_false(!cache_can_fplookup(&fpl))) { *status = fpl.status; SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); return (EOPNOTSUPP); } cache_fpl_checkpoint_outer(&fpl); cache_fpl_smr_enter_initial(&fpl); #ifdef INVARIANTS fpl.debug.ni_pathlen = ndp->ni_pathlen; #endif fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1]; fpl.fsearch = false; fpl.tvp = NULL; /* for degenerate path handling */ fpl.pwd = pwdp; pwd = pwd_get_smr(); *(fpl.pwd) = pwd; namei_setup_rootdir(ndp, cnp, pwd); ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { dvp = cache_fpl_handle_root(&fpl); ndp->ni_resflags = NIRES_ABS; } else { if (ndp->ni_dirfd == AT_FDCWD) { dvp = pwd->pwd_cdir; } else { error = cache_fplookup_dirfd(&fpl, &dvp); if (__predict_false(error != 0)) { goto out; } } } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); out: cache_fpl_smr_assert_not_entered(&fpl); cache_fpl_assert_status(&fpl); *status = fpl.status; if (SDT_PROBES_ENABLED()) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); if (fpl.status == CACHE_FPL_STATUS_HANDLED) SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true, ndp); } if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) { MPASS(error != CACHE_FPL_FAILED); if (error != 0) { cache_fpl_cleanup_cnp(fpl.cnp); MPASS(fpl.dvp == NULL); MPASS(fpl.tvp == NULL); } ndp->ni_dvp = fpl.dvp; ndp->ni_vp = fpl.tvp; } return (error); } diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h index 5fce4b8e1713..f08080d4e4fa 100644 --- a/sys/sys/sysctl.h +++ b/sys/sys/sysctl.h @@ -1,1221 +1,1242 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sysctl.h 8.1 (Berkeley) 6/2/93 */ #ifndef _SYS_SYSCTL_H_ #define _SYS_SYSCTL_H_ #ifdef _KERNEL +#include #include #include #endif /* * Definitions for sysctl call. The sysctl call uses a hierarchical name * for objects that can be examined or modified. The name is expressed as * a sequence of integers. Like a file path name, the meaning of each * component depends on its place in the hierarchy. The top-level and kern * identifiers are defined here, and other identifiers are defined in the * respective subsystem header files. * * Each subsystem defined by sysctl defines a list of variables for that * subsystem. Each name is either a node with further levels defined below it, * or it is a leaf of some particular type given below. Each sysctl level * defines a set of name/type pairs to be used by sysctl(8) in manipulating the * subsystem. */ #define CTL_MAXNAME 24 /* largest number of components supported */ #define CTLTYPE 0xf /* mask for the type */ #define CTLTYPE_NODE 1 /* name is a node */ #define CTLTYPE_INT 2 /* name describes an integer */ #define CTLTYPE_STRING 3 /* name describes a string */ #define CTLTYPE_S64 4 /* name describes a signed 64-bit number */ #define CTLTYPE_OPAQUE 5 /* name describes a structure */ #define CTLTYPE_STRUCT CTLTYPE_OPAQUE /* name describes a structure */ #define CTLTYPE_UINT 6 /* name describes an unsigned integer */ #define CTLTYPE_LONG 7 /* name describes a long */ #define CTLTYPE_ULONG 8 /* name describes an unsigned long */ #define CTLTYPE_U64 9 /* name describes an unsigned 64-bit number */ #define CTLTYPE_U8 0xa /* name describes an unsigned 8-bit number */ #define CTLTYPE_U16 0xb /* name describes an unsigned 16-bit number */ #define CTLTYPE_S8 0xc /* name describes a signed 8-bit number */ #define CTLTYPE_S16 0xd /* name describes a signed 16-bit number */ #define CTLTYPE_S32 0xe /* name describes a signed 32-bit number */ #define CTLTYPE_U32 0xf /* name describes an unsigned 32-bit number */ #define CTLFLAG_RD 0x80000000 /* Allow reads of variable */ #define CTLFLAG_WR 0x40000000 /* Allow writes to the variable */ #define CTLFLAG_RW (CTLFLAG_RD|CTLFLAG_WR) #define CTLFLAG_DORMANT 0x20000000 /* This sysctl is not active yet */ #define CTLFLAG_ANYBODY 0x10000000 /* All users can set this var */ #define CTLFLAG_SECURE 0x08000000 /* Permit set only if securelevel<=0 */ #define CTLFLAG_PRISON 0x04000000 /* Prisoned roots can fiddle */ #define CTLFLAG_DYN 0x02000000 /* Dynamic oid - can be freed */ #define CTLFLAG_SKIP 0x01000000 /* Skip this sysctl when listing */ #define CTLMASK_SECURE 0x00F00000 /* Secure level */ #define CTLFLAG_TUN 0x00080000 /* Default value is loaded from getenv() */ #define CTLFLAG_RDTUN (CTLFLAG_RD|CTLFLAG_TUN) #define CTLFLAG_RWTUN (CTLFLAG_RW|CTLFLAG_TUN) #define CTLFLAG_MPSAFE 0x00040000 /* Handler is MP safe */ #define CTLFLAG_VNET 0x00020000 /* Prisons with vnet can fiddle */ #define CTLFLAG_DYING 0x00010000 /* Oid is being removed */ #define CTLFLAG_CAPRD 0x00008000 /* Can be read in capability mode */ #define CTLFLAG_CAPWR 0x00004000 /* Can be written in capability mode */ #define CTLFLAG_STATS 0x00002000 /* Statistics, not a tuneable */ #define CTLFLAG_NOFETCH 0x00001000 /* Don't fetch tunable from getenv() */ #define CTLFLAG_CAPRW (CTLFLAG_CAPRD|CTLFLAG_CAPWR) /* * This is transient flag to be used until all sysctl handlers are converted * to not lock Giant. * One, and only one of CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT is required * for SYSCTL_PROC and SYSCTL_NODE. */ #define CTLFLAG_NEEDGIANT 0x00000800 /* Handler require Giant */ /* * Secure level. Note that CTLFLAG_SECURE == CTLFLAG_SECURE1. * * Secure when the securelevel is raised to at least N. */ #define CTLSHIFT_SECURE 20 #define CTLFLAG_SECURE1 (CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE)) #define CTLFLAG_SECURE2 (CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE)) #define CTLFLAG_SECURE3 (CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE)) /* * USE THIS instead of a hardwired number from the categories below * to get dynamically assigned sysctl entries using the linker-set * technology. This is the way nearly all new sysctl variables should * be implemented. * e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, ""); */ #define OID_AUTO (-1) /* * The starting number for dynamically-assigned entries. WARNING! * ALL static sysctl entries should have numbers LESS than this! */ #define CTL_AUTO_START 0x100 #ifdef _KERNEL #include #ifdef KLD_MODULE /* XXX allow overspecification of type in external kernel modules */ #define SYSCTL_CT_ASSERT_MASK CTLTYPE #else #define SYSCTL_CT_ASSERT_MASK 0 #endif #define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ intmax_t arg2, struct sysctl_req *req /* definitions for sysctl_req 'lock' member */ #define REQ_UNWIRED 1 #define REQ_WIRED 2 /* definitions for sysctl_req 'flags' member */ #ifdef COMPAT_FREEBSD32 #define SCTL_MASK32 1 /* 32 bit emulation */ #endif /* * This describes the access space for a sysctl request. This is needed * so that we can use the interface from the kernel or from user-space. */ struct thread; struct sysctl_req { struct thread *td; /* used for access checking */ int lock; /* wiring state */ void *oldptr; size_t oldlen; size_t oldidx; int (*oldfunc)(struct sysctl_req *, const void *, size_t); const void *newptr; size_t newlen; size_t newidx; int (*newfunc)(struct sysctl_req *, void *, size_t); size_t validlen; int flags; }; struct sysctl_oid; /* RB Tree handling */ RB_HEAD(sysctl_oid_list, sysctl_oid); /* * This describes one "oid" in the MIB tree. Potentially more nodes can * be hidden behind it, expanded by the handler. */ struct sysctl_oid { struct sysctl_oid_list oid_children; struct sysctl_oid_list* oid_parent; RB_ENTRY(sysctl_oid) oid_link; /* Sort key for all siblings, and lookup key for userland */ int oid_number; u_int oid_kind; void *oid_arg1; intmax_t oid_arg2; /* Must be unique amongst all siblings. */ const char *oid_name; int (*oid_handler)(SYSCTL_HANDLER_ARGS); const char *oid_fmt; int oid_refcnt; u_int oid_running; const char *oid_descr; const char *oid_label; }; static inline int cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b) { if (a->oid_number > b->oid_number) return (1); else if (a->oid_number < b->oid_number) return (-1); else return (0); } RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid); #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) #define SYSCTL_OUT(r, p, l) (r->oldfunc)(r, p, l) #define SYSCTL_OUT_STR(r, p) (r->oldfunc)(r, p, strlen(p) + 1) int sysctl_handle_bool(SYSCTL_HANDLER_ARGS); int sysctl_handle_8(SYSCTL_HANDLER_ARGS); int sysctl_handle_16(SYSCTL_HANDLER_ARGS); int sysctl_handle_32(SYSCTL_HANDLER_ARGS); int sysctl_handle_64(SYSCTL_HANDLER_ARGS); int sysctl_handle_int(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS); int sysctl_handle_long(SYSCTL_HANDLER_ARGS); int sysctl_handle_string(SYSCTL_HANDLER_ARGS); int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS); int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS); int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS); int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS); int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS); int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS); int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS); int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS); /* * These functions are used to add/remove an oid from the mib. */ void sysctl_register_oid(struct sysctl_oid *oidp); void sysctl_register_disabled_oid(struct sysctl_oid *oidp); void sysctl_enable_oid(struct sysctl_oid *oidp); void sysctl_unregister_oid(struct sysctl_oid *oidp); /* Declare a static oid to allow child oids to be added to it. */ #define SYSCTL_DECL(name) \ extern struct sysctl_oid sysctl__##name /* Hide these in macros. */ #define SYSCTL_CHILDREN(oid_ptr) (&(oid_ptr)->oid_children) #define SYSCTL_PARENT(oid_ptr) \ (((oid_ptr)->oid_parent != &sysctl__children) ? \ __containerof((oid_ptr)->oid_parent, struct sysctl_oid, \ oid_children) : (struct sysctl_oid *)NULL) #define SYSCTL_STATIC_CHILDREN(oid_name) (&sysctl__##oid_name.oid_children) /* === Structs and macros related to context handling. === */ /* All dynamically created sysctls can be tracked in a context list. */ struct sysctl_ctx_entry { struct sysctl_oid *entry; TAILQ_ENTRY(sysctl_ctx_entry) link; }; TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry); #define SYSCTL_NODE_CHILDREN(parent, name) \ sysctl__##parent##_##name.oid_children #ifndef NO_SYSCTL_DESCR #define __DESCR(d) d #else #define __DESCR(d) "" #endif #ifdef notyet #define SYSCTL_ENFORCE_FLAGS(x) \ _Static_assert((((x) & CTLFLAG_MPSAFE) != 0) ^ (((x) & CTLFLAG_NEEDGIANT) != 0), \ "Has to be either CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT") #else #define SYSCTL_ENFORCE_FLAGS(x) #endif /* This macro is only for internal use */ #define SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ struct sysctl_oid id = { \ .oid_parent = (parent_child_head), \ .oid_children = RB_INITIALIZER(&id.oid_children), \ .oid_number = (nbr), \ .oid_kind = (kind), \ .oid_arg1 = (a1), \ .oid_arg2 = (a2), \ .oid_name = (name), \ .oid_handler = (handler), \ .oid_fmt = (fmt), \ .oid_descr = __DESCR(descr), \ .oid_label = (label), \ }; \ DATA_SET(sysctl_set, id); \ SYSCTL_ENFORCE_FLAGS(kind) /* This constructs a static "raw" MIB oid. */ #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, \ handler, fmt, descr, NULL) #define SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ static SYSCTL_OID_RAW(sysctl__##parent##_##name, \ SYSCTL_CHILDREN(&sysctl__##parent), \ nbr, #name, kind, a1, a2, handler, fmt, descr, label) /* This constructs a global "raw" MIB oid. */ #define SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \ SYSCTL_OID_RAW(sysctl__##parent##_##name, \ SYSCTL_CHILDREN(&sysctl__##parent), \ nbr, #name, kind, a1, a2, handler, fmt, descr, label) #define SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ ({ \ SYSCTL_ENFORCE_FLAGS(kind); \ sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2,handler, \ fmt, __DESCR(descr), NULL); \ }) /* This constructs a root node from which other nodes can hang. */ #define SYSCTL_ROOT_NODE(nbr, name, access, handler, descr) \ SYSCTL_OID_RAW(sysctl___##name, &sysctl__children, \ nbr, #name, CTLTYPE_NODE|(access), NULL, 0, \ handler, "N", descr, NULL); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) /* This constructs a node from which other oids can hang. */ #define SYSCTL_NODE(parent, nbr, name, access, handler, descr) \ SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, NULL) #define SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, label) \ SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", descr, label); \ SYSCTL_ENFORCE_FLAGS(access); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE) #define SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr) \ SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, \ handler, descr, NULL) #define SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, handler, descr, label) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE); \ SYSCTL_ENFORCE_FLAGS(access); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", __DESCR(descr), label); \ }) #define SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE); \ SYSCTL_ENFORCE_FLAGS(access); \ sysctl_add_oid(ctx, &sysctl__children, nbr, name, \ CTLTYPE_NODE|(access), \ NULL, 0, handler, "N", __DESCR(descr), NULL); \ }) /* Oid for a string. len can be 0 to indicate '\0' termination. */ #define SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), \ arg, len, sysctl_handle_string, "A", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) #define SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \ ({ \ char *__arg = (arg); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), \ __arg, len, sysctl_handle_string, "A", __DESCR(descr), \ NULL); \ }) /* Oid for a constant '\0' terminated string. */ #define SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr) \ SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),\ __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \ CTASSERT(!((access) & CTLFLAG_WR)); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING) #define SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \ ({ \ char *__arg = __DECONST(char *, arg); \ CTASSERT(!((access) & CTLFLAG_WR)); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING); \ sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING | \ CTLFLAG_MPSAFE | (access), __arg, 0, sysctl_handle_string, "A",\ __DESCR(descr), NULL); \ }) /* Oid for a bool. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_BOOL_PTR ((bool *)NULL) #define SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_bool, "CU", descr); \ CTASSERT(((access) & CTLTYPE) == 0 && \ sizeof(bool) == sizeof(*(ptr))) #define SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ bool *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr), \ NULL); \ }) /* Oid for a signed 8-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S8_PTR ((int8_t *)NULL) #define SYSCTL_S8(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_8, "C", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \ sizeof(int8_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int8_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_8, "C", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 8-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U8_PTR ((uint8_t *)NULL) #define SYSCTL_U8(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_8, "CU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \ sizeof(uint8_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint8_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U8 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_8, "CU", __DESCR(descr), NULL); \ }) /* Oid for a signed 16-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S16_PTR ((int16_t *)NULL) #define SYSCTL_S16(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_16, "S", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \ sizeof(int16_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int16_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S16 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_16, "S", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 16-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U16_PTR ((uint16_t *)NULL) #define SYSCTL_U16(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_16, "SU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \ sizeof(uint16_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint16_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U16 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_16, "SU", __DESCR(descr), NULL); \ }) /* Oid for a signed 32-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S32_PTR ((int32_t *)NULL) #define SYSCTL_S32(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_32, "I", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \ sizeof(int32_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int32_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S32 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_32, "I", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 32-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U32_PTR ((uint32_t *)NULL) #define SYSCTL_U32(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_32, "IU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \ sizeof(uint32_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint32_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U32 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_32, "IU", __DESCR(descr), NULL); \ }) /* Oid for a signed 64-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_S64_PTR ((int64_t *)NULL) #define SYSCTL_S64(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "Q", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ sizeof(int64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_64, "Q", __DESCR(descr), NULL); \ }) /* Oid for an unsigned 64-bit int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_U64_PTR ((uint64_t *)NULL) #define SYSCTL_U64(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(uint64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ uint64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_64, "QU", __DESCR(descr), NULL); \ }) /* Oid for an int. If ptr is SYSCTL_NULL_INT_PTR, val is returned. */ #define SYSCTL_NULL_INT_PTR ((int *)NULL) #define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, NULL) #define SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, label) \ SYSCTL_OID_WITH_LABEL(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "I", descr, label); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \ sizeof(int) == sizeof(*(ptr))) #define SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ int *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_int, "I", __DESCR(descr), NULL); \ }) /* Oid for an unsigned int. If ptr is NULL, val is returned. */ #define SYSCTL_NULL_UINT_PTR ((unsigned *)NULL) #define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_int, "IU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \ sizeof(unsigned) == sizeof(*(ptr))) #define SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \ ({ \ unsigned *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ __ptr, val, sysctl_handle_int, "IU", __DESCR(descr), NULL); \ }) /* Oid for a long. The pointer must be non NULL. */ #define SYSCTL_NULL_LONG_PTR ((long *)NULL) #define SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_LONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "L", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \ sizeof(long) == sizeof(*(ptr))) #define SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ long *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_LONG | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_long, "L", __DESCR(descr), NULL); \ }) /* Oid for an unsigned long. The pointer must be non NULL. */ #define SYSCTL_NULL_ULONG_PTR ((unsigned long *)NULL) #define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_long, "LU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) && \ sizeof(unsigned long) == sizeof(*(ptr))) #define SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ unsigned long *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_long, "LU", __DESCR(descr), NULL); \ }) /* Oid for a quad. The pointer must be non NULL. */ #define SYSCTL_NULL_QUAD_PTR ((int64_t *)NULL) #define SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "Q", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \ sizeof(int64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ int64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_64, "Q", __DESCR(descr), NULL); \ }) #define SYSCTL_NULL_UQUAD_PTR ((uint64_t *)NULL) #define SYSCTL_UQUAD(parent, nbr, name, access, ptr, val, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ ptr, val, sysctl_handle_64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(uint64_t) == sizeof(*(ptr))) #define SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uint64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_64, "QU", __DESCR(descr), NULL); \ }) /* Oid for a CPU dependent variable */ #define SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ struct sysctl_oid *__ret; \ CTASSERT((sizeof(uint64_t) == sizeof(*(ptr)) || \ sizeof(unsigned) == sizeof(*(ptr))) && \ ((access) & CTLTYPE) == 0); \ if (sizeof(uint64_t) == sizeof(*(ptr))) { \ __ret = sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_64, "QU", \ __DESCR(descr), NULL); \ } else { \ __ret = sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_UINT | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_int, "IU", \ __DESCR(descr), NULL); \ } \ __ret; \ }) /* Oid for a 64-bit unsigned counter(9). The pointer must be non NULL. */ #define SYSCTL_COUNTER_U64(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access), \ (ptr), 0, sysctl_handle_counter_u64, "QU", descr); \ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \ sizeof(counter_u64_t) == sizeof(*(ptr)) && \ sizeof(uint64_t) == sizeof(**(ptr))) #define SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ counter_u64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access), \ __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr), \ NULL); \ }) /* Oid for an array of counter(9)s. The pointer and length must be non zero. */ #define SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access), \ (ptr), (len), sysctl_handle_counter_u64_array, "QU", descr);\ CTASSERT((((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) && \ sizeof(counter_u64_t) == sizeof(*(ptr)) && \ sizeof(uint64_t) == sizeof(**(ptr))) #define SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access, \ ptr, len, descr) \ ({ \ counter_u64_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access), \ __ptr, len, sysctl_handle_counter_u64_array, "S", \ __DESCR(descr), NULL); \ }) /* Oid for an opaque object. Specified by a pointer and a length. */ #define SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ ptr, len, sysctl_handle_opaque, fmt, descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) #define SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr), NULL); \ }) /* Oid for a struct. Specified by a pointer and a type. */ #define SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ ptr, sizeof(struct type), sysctl_handle_opaque, \ "S," #type, descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) #define SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access), \ (ptr), sizeof(struct type), \ sysctl_handle_opaque, "S," #type, __DESCR(descr), NULL); \ }) /* Oid for a procedure. Specified by a pointer and an arg. */ #define SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ SYSCTL_OID(parent, nbr, name, (access), \ ptr, arg, handler, fmt, descr); \ CTASSERT(((access) & CTLTYPE) != 0) #define SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \ ({ \ CTASSERT(((access) & CTLTYPE) != 0); \ SYSCTL_ENFORCE_FLAGS(access); \ sysctl_add_oid(ctx, parent, nbr, name, (access), \ (ptr), (arg), (handler), (fmt), __DESCR(descr), NULL); \ }) /* Oid to handle limits on uma(9) zone specified by pointer. */ #define SYSCTL_UMA_MAX(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ (ptr), 0, sysctl_handle_uma_zone_max, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_UMA_MAX(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uma_zone_t __ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | (access), \ __ptr, 0, sysctl_handle_uma_zone_max, "I", __DESCR(descr), \ NULL); \ }) /* Oid to obtain current use of uma(9) zone specified by pointer. */ #define SYSCTL_UMA_CUR(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_handle_uma_zone_cur, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_UMA_CUR(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ uma_zone_t __ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr), \ NULL); \ }) /* OID expressing a sbintime_t as microseconds */ #define SYSCTL_SBINTIME_USEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_usec_to_sbintime, "Q", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) #define SYSCTL_ADD_SBINTIME_USEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ sbintime_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_usec_to_sbintime, "Q", __DESCR(descr), \ NULL); \ }) /* OID expressing a sbintime_t as milliseconds */ #define SYSCTL_SBINTIME_MSEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_msec_to_sbintime, "Q", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) #define SYSCTL_ADD_SBINTIME_MSEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ sbintime_t *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_msec_to_sbintime, "Q", __DESCR(descr), \ NULL); \ }) /* OID expressing a struct timeval as seconds */ #define SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr) \ SYSCTL_OID(parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ (ptr), 0, sysctl_sec_to_timeval, "I", descr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) #define SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \ ({ \ struct timeval *__ptr = (ptr); \ CTASSERT(((access) & CTLTYPE) == 0 || \ ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT); \ sysctl_add_oid(ctx, parent, nbr, name, \ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access), \ __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr), \ NULL); \ }) #define SYSCTL_FOREACH(oidp, list) \ RB_FOREACH(oidp, sysctl_oid_list, list) /* * A macro to generate a read-only sysctl to indicate the presence of optional * kernel features. */ #define FEATURE(name, desc) \ SYSCTL_INT_WITH_LABEL(_kern_features, OID_AUTO, name, \ CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, desc, "feature") /* Same for the dynamic registration. */ #define FEATURE_ADD(name, desc) \ sysctl_add_oid(NULL, SYSCTL_CHILDREN(&sysctl___kern_features), \ OID_AUTO, name, \ CTLFLAG_RD | CTLFLAG_CAPRD | CTLTYPE_INT | CTLFLAG_MPSAFE, \ NULL, 1, sysctl_handle_int, "I", desc, "feature"); +/* + * Adding new leaves to the 'debug.sizeof' MIB tree for ad-hoc reasons is + * discouraged, and in particular for reporting to developers the size of some + * kernel structures, which can be obtained by the following alternative means: + * 1. In GDB, load a full kernel image and use 'print(sizeof(struct XXX))'. + * Alternatively, use 'ptype/o struct XXX' to additionally get the offsets + * and size of all structure's fields. + * 2. If the structure is allocated from UMA, then 'vmstat -z' reports its size + * (the mapping between structure types and zones is usually + * straightforward). + */ +/* Generates a read-only sysctl reporting the size of an object/structure. */ +#define SYSCTL_SIZEOF(name, expr) \ + SYSCTL_INT(_debug_sizeof, OID_AUTO, name, CTLFLAG_RD, \ + SYSCTL_NULL_INT_PTR, sizeof(expr), \ + "sizeof(" __STRING(expr) ")"); +/* Same, specialized for structures. */ +#define SYSCTL_SIZEOF_STRUCT(struct_name) \ + SYSCTL_SIZEOF(struct_name, struct struct_name) + #endif /* _KERNEL */ /* * Top-level identifiers */ #define CTL_SYSCTL 0 /* "magic" numbers */ #define CTL_KERN 1 /* "high kernel": proc, limits */ #define CTL_VM 2 /* virtual memory */ #define CTL_VFS 3 /* filesystem, mount type is next */ #define CTL_NET 4 /* network, see socket.h */ #define CTL_DEBUG 5 /* debugging parameters */ #define CTL_HW 6 /* generic cpu/io */ #define CTL_MACHDEP 7 /* machine dependent */ #define CTL_USER 8 /* user-level */ #define CTL_P1003_1B 9 /* POSIX 1003.1B */ /* * CTL_SYSCTL identifiers */ #define CTL_SYSCTL_DEBUG 0 /* printf all nodes */ #define CTL_SYSCTL_NAME 1 /* string name of OID */ #define CTL_SYSCTL_NEXT 2 /* next OID, honoring CTLFLAG_SKIP */ #define CTL_SYSCTL_NAME2OID 3 /* int array of name */ #define CTL_SYSCTL_OIDFMT 4 /* OID's kind and format */ #define CTL_SYSCTL_OIDDESCR 5 /* OID's description */ #define CTL_SYSCTL_OIDLABEL 6 /* aggregation label */ #define CTL_SYSCTL_NEXTNOSKIP 7 /* next OID, ignoring CTLFLAG_SKIP */ /* * CTL_KERN identifiers */ #define KERN_OSTYPE 1 /* string: system version */ #define KERN_OSRELEASE 2 /* string: system release */ #define KERN_OSREV 3 /* int: system revision */ #define KERN_VERSION 4 /* string: compile time info */ #define KERN_MAXVNODES 5 /* int: max vnodes */ #define KERN_MAXPROC 6 /* int: max processes */ #define KERN_MAXFILES 7 /* int: max open files */ #define KERN_ARGMAX 8 /* int: max arguments to exec */ #define KERN_SECURELVL 9 /* int: system security level */ #define KERN_HOSTNAME 10 /* string: hostname */ #define KERN_HOSTID 11 /* int: host identifier */ #define KERN_CLOCKRATE 12 /* struct: struct clockrate */ /* was: #define KERN_VNODE 13 ; disabled in 2003 and removed in 2023 */ #define KERN_PROC 14 /* struct: process entries */ #define KERN_FILE 15 /* struct: file entries */ #define KERN_PROF 16 /* node: kernel profiling info */ #define KERN_POSIX1 17 /* int: POSIX.1 version */ #define KERN_NGROUPS 18 /* int: # of supplemental group ids */ #define KERN_JOB_CONTROL 19 /* int: is job control available */ #define KERN_SAVED_IDS 20 /* int: saved set-user/group-ID */ #define KERN_BOOTTIME 21 /* struct: time kernel was booted */ #define KERN_NISDOMAINNAME 22 /* string: YP domain name */ #define KERN_UPDATEINTERVAL 23 /* int: update process sleep time */ #define KERN_OSRELDATE 24 /* int: kernel release date */ #define KERN_NTP_PLL 25 /* node: NTP PLL control */ #define KERN_BOOTFILE 26 /* string: name of booted kernel */ #define KERN_MAXFILESPERPROC 27 /* int: max open files per proc */ #define KERN_MAXPROCPERUID 28 /* int: max processes per uid */ #define KERN_DUMPDEV 29 /* struct cdev *: device to dump on */ #define KERN_IPC 30 /* node: anything related to IPC */ #define KERN_DUMMY 31 /* unused */ #define KERN_PS_STRINGS 32 /* int: address of PS_STRINGS */ #define KERN_USRSTACK 33 /* int: address of USRSTACK */ #define KERN_LOGSIGEXIT 34 /* int: do we log sigexit procs? */ #define KERN_IOV_MAX 35 /* int: value of UIO_MAXIOV */ #define KERN_HOSTUUID 36 /* string: host UUID identifier */ #define KERN_ARND 37 /* int: from arc4rand() */ #define KERN_MAXPHYS 38 /* int: MAXPHYS value */ #define KERN_LOCKF 39 /* struct: lockf reports */ /* * KERN_PROC subtypes */ #define KERN_PROC_ALL 0 /* everything */ #define KERN_PROC_PID 1 /* by process id */ #define KERN_PROC_PGRP 2 /* by process group id */ #define KERN_PROC_SESSION 3 /* by session of pid */ #define KERN_PROC_TTY 4 /* by controlling tty */ #define KERN_PROC_UID 5 /* by effective uid */ #define KERN_PROC_RUID 6 /* by real uid */ #define KERN_PROC_ARGS 7 /* get/set arguments/proctitle */ #define KERN_PROC_PROC 8 /* only return procs */ #define KERN_PROC_SV_NAME 9 /* get syscall vector name */ #define KERN_PROC_RGID 10 /* by real group id */ #define KERN_PROC_GID 11 /* by effective group id */ #define KERN_PROC_PATHNAME 12 /* path to executable */ #define KERN_PROC_OVMMAP 13 /* Old VM map entries for process */ #define KERN_PROC_OFILEDESC 14 /* Old file descriptors for process */ #define KERN_PROC_KSTACK 15 /* Kernel stacks for process */ #define KERN_PROC_INC_THREAD 0x10 /* * modifier for pid, pgrp, tty, * uid, ruid, gid, rgid and proc * This effectively uses 16-31 */ #define KERN_PROC_VMMAP 32 /* VM map entries for process */ #define KERN_PROC_FILEDESC 33 /* File descriptors for process */ #define KERN_PROC_GROUPS 34 /* process groups */ #define KERN_PROC_ENV 35 /* get environment */ #define KERN_PROC_AUXV 36 /* get ELF auxiliary vector */ #define KERN_PROC_RLIMIT 37 /* process resource limits */ #define KERN_PROC_PS_STRINGS 38 /* get ps_strings location */ #define KERN_PROC_UMASK 39 /* process umask */ #define KERN_PROC_OSREL 40 /* osreldate for process binary */ #define KERN_PROC_SIGTRAMP 41 /* signal trampoline location */ #define KERN_PROC_CWD 42 /* process current working directory */ #define KERN_PROC_NFDS 43 /* number of open file descriptors */ #define KERN_PROC_SIGFASTBLK 44 /* address of fastsigblk magic word */ #define KERN_PROC_VM_LAYOUT 45 /* virtual address space layout info */ #define KERN_PROC_RLIMIT_USAGE 46 /* array of rlim_t */ #define KERN_PROC_KQUEUE 47 /* array of struct kinfo_knote */ /* * KERN_IPC identifiers */ #define KIPC_MAXSOCKBUF 1 /* int: max size of a socket buffer */ #define KIPC_SOCKBUF_WASTE 2 /* int: wastage factor in sockbuf */ #define KIPC_SOMAXCONN 3 /* int: max length of connection q */ #define KIPC_MAX_LINKHDR 4 /* int: max length of link header */ #define KIPC_MAX_PROTOHDR 5 /* int: max length of network header */ #define KIPC_MAX_HDR 6 /* int: max total length of headers */ #define KIPC_MAX_DATALEN 7 /* int: max length of data? */ /* * CTL_HW identifiers */ #define HW_MACHINE 1 /* string: machine class */ #define HW_MODEL 2 /* string: specific machine model */ #define HW_NCPU 3 /* int: number of cpus */ #define HW_BYTEORDER 4 /* int: machine byte order */ #define HW_PHYSMEM 5 /* int: total memory */ #define HW_USERMEM 6 /* int: non-kernel memory */ #define HW_PAGESIZE 7 /* int: software page size */ #define HW_DISKNAMES 8 /* strings: disk drive names */ #define HW_DISKSTATS 9 /* struct: diskstats[] */ #define HW_FLOATINGPT 10 /* int: has HW floating point? */ #define HW_MACHINE_ARCH 11 /* string: machine architecture */ #define HW_REALMEM 12 /* int: 'real' memory */ /* * CTL_USER definitions */ #define USER_CS_PATH 1 /* string: _CS_PATH */ #define USER_BC_BASE_MAX 2 /* int: BC_BASE_MAX */ #define USER_BC_DIM_MAX 3 /* int: BC_DIM_MAX */ #define USER_BC_SCALE_MAX 4 /* int: BC_SCALE_MAX */ #define USER_BC_STRING_MAX 5 /* int: BC_STRING_MAX */ #define USER_COLL_WEIGHTS_MAX 6 /* int: COLL_WEIGHTS_MAX */ #define USER_EXPR_NEST_MAX 7 /* int: EXPR_NEST_MAX */ #define USER_LINE_MAX 8 /* int: LINE_MAX */ #define USER_RE_DUP_MAX 9 /* int: RE_DUP_MAX */ #define USER_POSIX2_VERSION 10 /* int: POSIX2_VERSION */ #define USER_POSIX2_C_BIND 11 /* int: POSIX2_C_BIND */ #define USER_POSIX2_C_DEV 12 /* int: POSIX2_C_DEV */ #define USER_POSIX2_CHAR_TERM 13 /* int: POSIX2_CHAR_TERM */ #define USER_POSIX2_FORT_DEV 14 /* int: POSIX2_FORT_DEV */ #define USER_POSIX2_FORT_RUN 15 /* int: POSIX2_FORT_RUN */ #define USER_POSIX2_LOCALEDEF 16 /* int: POSIX2_LOCALEDEF */ #define USER_POSIX2_SW_DEV 17 /* int: POSIX2_SW_DEV */ #define USER_POSIX2_UPE 18 /* int: POSIX2_UPE */ #define USER_STREAM_MAX 19 /* int: POSIX2_STREAM_MAX */ #define USER_TZNAME_MAX 20 /* int: POSIX2_TZNAME_MAX */ #define USER_LOCALBASE 21 /* string: _PATH_LOCALBASE */ #define CTL_P1003_1B_ASYNCHRONOUS_IO 1 /* boolean */ #define CTL_P1003_1B_MAPPED_FILES 2 /* boolean */ #define CTL_P1003_1B_MEMLOCK 3 /* boolean */ #define CTL_P1003_1B_MEMLOCK_RANGE 4 /* boolean */ #define CTL_P1003_1B_MEMORY_PROTECTION 5 /* boolean */ #define CTL_P1003_1B_MESSAGE_PASSING 6 /* boolean */ #define CTL_P1003_1B_PRIORITIZED_IO 7 /* boolean */ #define CTL_P1003_1B_PRIORITY_SCHEDULING 8 /* boolean */ #define CTL_P1003_1B_REALTIME_SIGNALS 9 /* boolean */ #define CTL_P1003_1B_SEMAPHORES 10 /* boolean */ #define CTL_P1003_1B_FSYNC 11 /* boolean */ #define CTL_P1003_1B_SHARED_MEMORY_OBJECTS 12 /* boolean */ #define CTL_P1003_1B_SYNCHRONIZED_IO 13 /* boolean */ #define CTL_P1003_1B_TIMERS 14 /* boolean */ #define CTL_P1003_1B_AIO_LISTIO_MAX 15 /* int */ #define CTL_P1003_1B_AIO_MAX 16 /* int */ #define CTL_P1003_1B_AIO_PRIO_DELTA_MAX 17 /* int */ #define CTL_P1003_1B_DELAYTIMER_MAX 18 /* int */ #define CTL_P1003_1B_MQ_OPEN_MAX 19 /* int */ #define CTL_P1003_1B_PAGESIZE 20 /* int */ #define CTL_P1003_1B_RTSIG_MAX 21 /* int */ #define CTL_P1003_1B_SEM_NSEMS_MAX 22 /* int */ #define CTL_P1003_1B_SEM_VALUE_MAX 23 /* int */ #define CTL_P1003_1B_SIGQUEUE_MAX 24 /* int */ #define CTL_P1003_1B_TIMER_MAX 25 /* int */ #ifdef _KERNEL #define CTL_P1003_1B_MAXID 26 /* * Declare some common oids. */ extern struct sysctl_oid_list sysctl__children; SYSCTL_DECL(_kern); SYSCTL_DECL(_kern_features); SYSCTL_DECL(_kern_ipc); SYSCTL_DECL(_kern_proc); SYSCTL_DECL(_kern_sched); SYSCTL_DECL(_kern_sched_stats); SYSCTL_DECL(_sysctl); SYSCTL_DECL(_vm); SYSCTL_DECL(_vm_stats); SYSCTL_DECL(_vm_stats_misc); SYSCTL_DECL(_vfs); SYSCTL_DECL(_net); SYSCTL_DECL(_debug); SYSCTL_DECL(_debug_sizeof); SYSCTL_DECL(_dev); SYSCTL_DECL(_hw); SYSCTL_DECL(_hw_bus); SYSCTL_DECL(_hw_bus_devices); SYSCTL_DECL(_machdep); SYSCTL_DECL(_machdep_mitigations); SYSCTL_DECL(_user); SYSCTL_DECL(_compat); SYSCTL_DECL(_regression); SYSCTL_DECL(_security); SYSCTL_DECL(_security_bsd); extern const char machine[]; extern const char osrelease[]; extern const char ostype[]; extern const char kern_ident[]; /* Dynamic oid handling */ struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent, int nbr, const char *name, int kind, void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr, const char *label); int sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del, int recurse); void sysctl_rename_oid(struct sysctl_oid *oidp, const char *name); int sysctl_move_oid(struct sysctl_oid *oidp, struct sysctl_oid_list *parent); int sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse); int sysctl_ctx_init(struct sysctl_ctx_list *clist); int sysctl_ctx_free(struct sysctl_ctx_list *clist); struct sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); struct sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); int sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp); int kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags); int kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags); int userland_sysctl(struct thread *td, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, const void *new, size_t newlen, size_t *retval, int flags); int sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid, int *nindx, struct sysctl_req *req); void sysctl_wlock(void); void sysctl_wunlock(void); int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len); int kern___sysctlbyname(struct thread *td, const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags, bool inkernel); struct sbuf; struct sbuf *sbuf_new_for_sysctl(struct sbuf *, char *, int, struct sysctl_req *); #else /* !_KERNEL */ #include #include #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif __BEGIN_DECLS int sysctl(const int *, unsigned int, void *, size_t *, const void *, size_t); int sysctlbyname(const char *, void *, size_t *, const void *, size_t); int sysctlnametomib(const char *, int *, size_t *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_SYSCTL_H_ */