diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
index fea34273baef..61a59be9f78b 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode_os.c
@@ -1,1845 +1,1849 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/mntent.h>
 #include <sys/u8_textprep.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/atomic.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_rlock.h>
 #include <sys/zfs_fuid.h>
 #include <sys/dnode.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
 #include <sys/zfs_refcount.h>
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/zfs_znode.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_stat.h>
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /* Used by fstat(1). */
+#ifdef SYSCTL_SIZEOF
+SYSCTL_SIZEOF(znode, znode_t);
+#else
 SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
 	SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+#endif
 
 /*
  * Define ZNODE_STATS to turn on statistics gathering.  By default, it is only
  * turned on when ZFS_DEBUG is also defined.
  */
 #ifdef	ZFS_DEBUG
 #define	ZNODE_STATS
 #endif	/* DEBUG */
 
 #ifdef	ZNODE_STATS
 #define	ZNODE_STAT_ADD(stat)			((stat)++)
 #else
 #define	ZNODE_STAT_ADD(stat)			/* nothing */
 #endif	/* ZNODE_STATS */
 
 #if !defined(KMEM_DEBUG)
 #define	_ZFS_USE_SMR
 static uma_zone_t znode_uma_zone;
 #else
 static kmem_cache_t *znode_cache = NULL;
 #endif
 
 extern struct vop_vector zfs_vnodeops;
 extern struct vop_vector zfs_fifoops;
 extern struct vop_vector zfs_shareops;
 
 
 /*
  * Range-lock callback for z_rangelock, run when an RL_WRITER or RL_APPEND
  * lock is being acquired.  It rewrites the requested offset/length using
  * znode-specific state and turns RL_APPEND requests into RL_WRITER ones.
  * It is called with the rangelock_t's rl_lock held, which avoids races.
  */
 static void
 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
 {
 	znode_t *zp = arg;
 	uint64_t range_end;
 
 	/* An append is a write that starts at the current end of file. */
 	if (new->lr_type == RL_APPEND) {
 		new->lr_offset = zp->z_size;
 		new->lr_type = RL_WRITER;
 	}
 
 	/* Nothing more to do unless the block size will have to grow. */
 	range_end = MAX(zp->z_size, new->lr_offset + new->lr_length);
 	if (range_end <= zp->z_blksz)
 		return;
 	if (ISP2(zp->z_blksz) && zp->z_blksz >= ZTOZSB(zp)->z_max_blksz)
 		return;
 
 	/* Growing the block size requires locking the whole file range. */
 	new->lr_offset = 0;
 	new->lr_length = UINT64_MAX;
 }
 
 /*
  * Cache constructor: bring a freshly allocated znode_t into its idle state.
  * The zfsvfs back pointer is invalidated, the cached pointers and write
  * counters are cleared, and the list linkage, locks and range lock are
  * initialized.  Neither arg nor kmflags is consulted.  Always returns 0.
  */
 static int
 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 {
 	(void) arg, (void) kmflags;
 	znode_t *zp = buf;
 
 	/* Plain fields. */
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
 	zp->z_vnode = NULL;
 	zp->z_acl_cached = NULL;
 	zp->z_xattr_cached = NULL;
 	zp->z_xattr_parent = 0;
 	zp->z_sync_writes_cnt = 0;
 	zp->z_async_writes_cnt = 0;
 
 	/* List linkage and synchronization primitives. */
 	list_link_init(&zp->z_link_node);
 	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
 
 	return (0);
 }
 
 /*
  * Cache destructor: verify that the znode is idle (no zfsvfs, no vnode, not
  * on a list, no cached ACL/xattr data, no outstanding write counts) and
  * tear down the locks set up by the constructor.
  */
 static void
 zfs_znode_cache_destructor(void *buf, void *arg)
 {
 	(void) arg;
 	znode_t *zp = buf;
 
 	/* Sanity checks on the state the znode is being returned in. */
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 	ASSERT3P(zp->z_vnode, ==, NULL);
 	ASSERT(!list_link_active(&zp->z_link_node));
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 	ASSERT3P(zp->z_xattr_cached, ==, NULL);
 	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
 	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 
 	/* Release the synchronization primitives. */
 	mutex_destroy(&zp->z_lock);
 	mutex_destroy(&zp->z_acl_lock);
 	rw_destroy(&zp->z_xattr_lock);
 	zfs_rangelock_fini(&zp->z_rangelock);
 }
 
 
 #ifdef _ZFS_USE_SMR
 VFS_SMR_DECLARE;
 
 /*
  * UMA constructor hook: adapts the UMA callback signature to
  * zfs_znode_cache_constructor().  The item size is not used.
  */
 static int
 zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private,
     int flags)
 {
 	int rc;
 
 	rc = zfs_znode_cache_constructor(mem, private, flags);
 	return (rc);
 }
 
 /*
  * UMA destructor hook: adapts the UMA callback signature to
  * zfs_znode_cache_destructor().  The item size is not used.
  */
 static void
 zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
 {
 	znode_t *zp = mem;
 
 	zfs_znode_cache_destructor(zp, private);
 }
 
 void
 zfs_znode_init(void)
 {
 	/*
 	 * Initialize zcache
 	 */
 	ASSERT3P(znode_uma_zone, ==, NULL);
 	znode_uma_zone = uma_zcreate("zfs_znode_cache",
 	    sizeof (znode_t), zfs_znode_cache_constructor_smr,
 	    zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
 	VFS_SMR_ZONE_SET(znode_uma_zone);
 }
 
 /*
  * Allocate a constructed znode_t from the SMR-enabled UMA zone.
  */
 static znode_t *
 zfs_znode_alloc_kmem(int flags)
 {
 	znode_t *zp;
 
 	zp = uma_zalloc_smr(znode_uma_zone, flags);
 	return (zp);
 }
 
 /*
  * Return a znode_t to the SMR-enabled UMA zone, dropping any cached xattr
  * nvlist first.
  */
 static void
 zfs_znode_free_kmem(znode_t *zp)
 {
 	if (zp->z_xattr_cached != NULL) {
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
 
 	uma_zfree_smr(znode_uma_zone, zp);
 }
 #else
 void
 zfs_znode_init(void)
 {
 	/*
 	 * Initialize zcache
 	 */
 	ASSERT3P(znode_cache, ==, NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
 	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
 }
 
 /*
  * Allocate a constructed znode_t from the kmem cache.
  */
 static znode_t *
 zfs_znode_alloc_kmem(int flags)
 {
 	znode_t *zp;
 
 	zp = kmem_cache_alloc(znode_cache, flags);
 	return (zp);
 }
 
 /*
  * Return a znode_t to the kmem cache, dropping any cached xattr nvlist
  * first.
  */
 static void
 zfs_znode_free_kmem(znode_t *zp)
 {
 	if (zp->z_xattr_cached != NULL) {
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
 
 	kmem_cache_free(znode_cache, zp);
 }
 #endif
 
 /*
  * Destroy the znode allocator created by zfs_znode_init(), if any, and
  * reset the global handle so that a later zfs_znode_init() can succeed.
  */
 void
 zfs_znode_fini(void)
 {
 #ifdef _ZFS_USE_SMR
 	if (znode_uma_zone == NULL)
 		return;
 
 	uma_zdestroy(znode_uma_zone);
 	znode_uma_zone = NULL;
 #else
 	if (znode_cache == NULL)
 		return;
 
 	kmem_cache_destroy(znode_cache);
 	znode_cache = NULL;
 #endif
 }
 
 
 /*
  * Create the on-disk shares directory object and record it in the master
  * node under ZFS_SHARES_DIR.
  *
  * A temporary znode is allocated only to drive zfs_mknode() with
  * IS_ROOT_NODE; it is freed again before returning.  zfsvfs->z_shares_dir
  * is set to the new object id regardless of the zap_add() result.
  *
  * Returns the result of zap_add().
  */
 static int
 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
 {
 	zfs_acl_ids_t acl_ids;
 	vattr_t vattr;
 	znode_t *sharezp;
 	znode_t *zp;
 	int error;
 
 	/* Directory, mode 0555, owned by the kernel credential. */
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0555;
 	vattr.va_uid = crgetuid(kcred);
 	vattr.va_gid = crgetgid(kcred);
 
 	/* Scratch znode carrying just enough state for zfs_mknode(). */
 	sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
 	sharezp->z_unlinked = 0;
 	sharezp->z_atime_dirty = 0;
 	sharezp->z_zfsvfs = zfsvfs;
 	sharezp->z_is_sa = zfsvfs->z_use_sa;
 
 	VERIFY0(zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
 	    kcred, NULL, &acl_ids, NULL));
 	/* With IS_ROOT_NODE, zfs_mknode() hands back the znode passed in. */
 	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, sharezp);
 	/* The cache destructor asserts that z_zfsvfs is not a valid pointer. */
 	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
 	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
 	zfsvfs->z_shares_dir = sharezp->z_id;
 
 	/* Release everything that was only needed for the creation. */
 	zfs_acl_ids_free(&acl_ids);
 	sa_handle_destroy(sharezp->z_sa_hdl);
 	zfs_znode_free_kmem(sharezp);
 
 	return (error);
 }
 
 /*
  * define a couple of values we need available
  * for both 64 and 32 bit environments.
  */
 #ifndef NBITSMINOR64
 #define	NBITSMINOR64	32
 #endif
 #ifndef MAXMAJ64
 #define	MAXMAJ64	0xffffffffUL
 #endif
 #ifndef	MAXMIN64
 #define	MAXMIN64	0xffffffffUL
 #endif
 
 /*
  * ZFS-private expldev.
  * The standard expldev() cannot be used because it takes a dev32_t in LP64
  * and expands it to a long dev_t, whereas ZFS needs an interface that takes
  * a dev32_t in ILP32 and expands it to a long dev_t.  The major number ends
  * up above bit NBITSMINOR64, the minor number below it.
  */
 static uint64_t
 zfs_expldev(dev_t dev)
 {
 	uint64_t hi = (uint64_t)major(dev) << NBITSMINOR64;
 
 	return (hi | minor(dev));
 }
 /*
  * ZFS-private cmpldev.
  * The standard cmpldev() cannot be used because it takes a long dev_t and
  * compresses it to a dev32_t in LP64, whereas ZFS needs to compact a long
  * dev_t to a dev32_t in ILP32.  This is the inverse of zfs_expldev().
  */
 dev_t
 zfs_cmpldev(uint64_t dev)
 {
 	uint64_t hi = dev >> NBITSMINOR64;
 	uint64_t lo = dev & MAXMIN64;
 
 	return (makedev(hi, lo));
 }
 
 /*
  * Attach an SA handle to a znode.
  *
  * When the caller supplies sa_hdl it is adopted and pointed back at zp;
  * otherwise a shared handle is obtained from db.  z_is_sa records whether
  * the object uses the DMU_OT_SA bonus type.  The object mutex for zp->z_id
  * must be held, and the znode must not yet have an SA handle or cached ACL.
  */
 static void
 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 {
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
 	ASSERT3P(zp->z_sa_hdl, ==, NULL);
 	ASSERT3P(zp->z_acl_cached, ==, NULL);
 
 	if (sa_hdl != NULL) {
 		/* Adopt the caller's handle. */
 		zp->z_sa_hdl = sa_hdl;
 		sa_set_userp(sa_hdl, zp);
 	} else {
 		/* No handle supplied: get one from the bonus buffer. */
 		VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 		    SA_HDL_SHARED, &zp->z_sa_hdl));
 	}
 
 	if (obj_type == DMU_OT_SA)
 		zp->z_is_sa = B_TRUE;
 	else
 		zp->z_is_sa = B_FALSE;
 
 	/*
 	 * The root znode gets VROOT, except when it is the root node of a
 	 * snapshot mounted under .zfs.
 	 */
 	if (zp->z_id == zfsvfs->z_root) {
 		if (zfsvfs->z_parent == zfsvfs)
 			ZTOV(zp)->v_flag |= VROOT;
 	}
 
 	vn_exists(ZTOV(zp));
 }
 
 /*
  * Detach a znode from the DMU: destroy its SA handle and clear z_sa_hdl.
  *
  * The caller must either hold the object mutex for zp->z_id or satisfy
  * ZFS_TEARDOWN_INACTIVE_WRITE_HELD() for the znode's zfsvfs (asserted).
  */
 void
 zfs_znode_dmu_fini(znode_t *zp)
 {
 	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
 	    ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs));
 
 	sa_handle_destroy(zp->z_sa_hdl);
 	zp->z_sa_hdl = NULL;
 }
 
 /*
  * Dispose of a vnode that is not going to be used: clear its v_data,
  * switch it to dead_vnodeops, then vgone() and vput() it.
  *
  * zfs_znode_alloc() calls this on its error path and takes the vnode lock
  * exclusively beforehand for that reason.
  */
 static void
 zfs_vnode_forget(vnode_t *vp)
 {
 
 	/* copied from insmntque_stddtr */
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Construct a new znode/vnode pair and initialize it from the object's
  * system attributes.
  *
  * This does not call dmu_set_user(); that is up to the caller, in case
  * the znode is not going to be returned.
  *
  *	IN:	zfsvfs	 - file system the object belongs to
  *		db	 - bonus buffer of the object
  *		blksz	 - value stored in z_blksz
  *		obj_type - bonus type of the object
  *		hdl	 - existing SA handle to adopt, or NULL to create one
  *
  * Returns the new znode with its vnode exclusively locked, or NULL when no
  * vnode could be obtained, the attribute lookup failed, or the generation
  * is zero.  On the lookup-failure path the SA handle is destroyed only if
  * it was created here (hdl == NULL).
  */
 static znode_t *
 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
     dmu_object_type_t obj_type, sa_handle_t *hdl)
 {
 	znode_t	*zp;
 	vnode_t *vp;
 	uint64_t mode;
 	uint64_t parent;
 #ifdef notyet
 	uint64_t mtime[2], ctime[2];
 #endif
 	uint64_t projid = ZFS_DEFAULT_PROJID;
 	sa_bulk_attr_t bulk[9];
 	int count = 0;
 	int error;
 
 	zp = zfs_znode_alloc_kmem(KM_SLEEP);
 
 #ifndef _ZFS_USE_SMR
 	KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
 	    ("%s: fast path lookup enabled without smr", __func__));
 #endif
 
 	/* Callers are expected to have reserved a vnode beforehand. */
 	KASSERT(curthread->td_vp_reserved != NULL,
 	    ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
 	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
 	if (error != 0) {
 		zfs_znode_free_kmem(zp);
 		return (NULL);
 	}
 	zp->z_vnode = vp;
 	vp->v_data = zp;
 
 	/*
 	 * Acquire the vnode lock before any possible interaction with the
 	 * outside world.  Specifically, there is an error path that calls
 	 * zfs_vnode_forget() and the vnode should be exclusively locked.
 	 */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
 
 	/* Reset the per-use fields that the cache constructor leaves alone. */
 	zp->z_sa_hdl = NULL;
 	zp->z_unlinked = 0;
 	zp->z_atime_dirty = 0;
 	zp->z_mapcnt = 0;
 	zp->z_id = db->db_object;
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
 	zp->z_sync_cnt = 0;
 	zp->z_sync_writes_cnt = 0;
 	zp->z_async_writes_cnt = 0;
 	atomic_store_ptr(&zp->z_cached_symlink, NULL);
 
 	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
 
 	/* Attributes to load into the znode (or into locals) in one lookup. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &zp->z_links, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &zp->z_atime, 16);
 #ifdef notyet
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 	    &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 	    &ctime, 16);
 #endif
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 	    &zp->z_uid, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 	    &zp->z_gid, 8);
 
 	/*
 	 * Fail if the bulk lookup fails, if the generation is zero, or if
 	 * project quotas are enabled, the object carries ZFS_PROJID and its
 	 * project id cannot be read.
 	 */
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
 	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
 	    (zp->z_pflags & ZFS_PROJID) &&
 	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
 		/* Only destroy a handle that was created here. */
 		if (hdl == NULL)
 			sa_handle_destroy(zp->z_sa_hdl);
 		zfs_vnode_forget(vp);
 		zp->z_vnode = NULL;
 		zfs_znode_free_kmem(zp);
 		return (NULL);
 	}
 
 	zp->z_projid = projid;
 	zp->z_mode = mode;
 
 	/* Cache the xattr parent id */
 	if (zp->z_pflags & ZFS_XATTR)
 		zp->z_xattr_parent = parent;
 
 	vp->v_type = IFTOVT((mode_t)mode);
 
 	/* Pick the vnode operations vector according to the file type. */
 	switch (vp->v_type) {
 	case VDIR:
 		zp->z_zn_prefetch = B_TRUE; /* z_zn_prefetch default is enabled */
 		break;
 	case VFIFO:
 		vp->v_op = &zfs_fifoops;
 		break;
 	case VREG:
 		/* Regular files directly under the shares directory. */
 		if (parent == zfsvfs->z_shares_dir) {
 			ASSERT0(zp->z_uid);
 			ASSERT0(zp->z_gid);
 			vp->v_op = &zfs_shareops;
 		}
 		break;
 	default:
 			break;
 	}
 
 	/* Publish the znode on the file system's list of all znodes. */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	zp->z_zfsvfs = zfsvfs;
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 #if __FreeBSD_version >= 1400077
 	vn_set_state(vp, VSTATE_CONSTRUCTED);
 #endif
 	VN_LOCK_AREC(vp);
 	if (vp->v_type != VFIFO)
 		VN_LOCK_ASHARE(vp);
 
 	return (zp);
 }
 
 static uint64_t empty_xattr;
 static uint64_t pad[4];
 static zfs_acl_phys_t acl_phys;
 /*
  * Create a new DMU object to hold a zfs znode.
  *
  *	IN:	dzp	- parent directory for new znode; with IS_ROOT_NODE
  *			  this is the (half-initialized) root znode itself
  *		vap	- file attributes for new znode
  *		tx	- dmu transaction id for zap operations
  *		cr	- credentials of caller
  *		flag	- flags:
  *			  IS_ROOT_NODE	- new object will be root
  *			  IS_XATTR	- new object is an attribute
  *		acl_ids	- initial ACL and owner ids for the new object
  *			  (z_mode, z_fuid, z_fgid and z_aclp are used)
  *
  *	OUT:	zpp	- allocated znode (dzp itself for IS_ROOT_NODE)
  *
  * When replaying (zfsvfs->z_replay), the object number, timestamp,
  * generation and dnode size are taken from fields of vap instead of being
  * generated here; see zfs_replay_create().
  */
 void
 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 {
 	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
 	uint64_t	mode, size, links, parent, pflags;
 	uint64_t	dzp_pflags = 0;
 	uint64_t	rdev = 0;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	dmu_buf_t	*db;
 	timestruc_t	now;
 	uint64_t	gen, obj;
 	int		bonuslen;
 	int		dnodesize;
 	sa_handle_t	*sa_hdl;
 	dmu_object_type_t obj_type;
 	sa_bulk_attr_t	*sa_attrs;
 	int		cnt = 0;
 	zfs_acl_locator_cb_t locate = { 0 };
 
 	ASSERT3P(vap, !=, NULL);
 	ASSERT3U((vap->va_mask & AT_MODE), ==, AT_MODE);
 
 	if (zfsvfs->z_replay) {
 		obj = vap->va_nodeid;
 		now = vap->va_ctime;		/* see zfs_replay_create() */
 		gen = vap->va_nblocks;		/* ditto */
 		dnodesize = vap->va_fsid;	/* ditto */
 	} else {
 		obj = 0;
 		vfs_timestamp(&now);
 		gen = dmu_tx_get_txg(tx);
 		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
 	}
 
 	if (dnodesize == 0)
 		dnodesize = DNODE_MIN_SIZE;
 
 	/* The bonus type and size depend on whether SA is in use. */
 	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
 	bonuslen = (obj_type == DMU_OT_SA) ?
 	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
 
 	/*
 	 * Create a new DMU object.
 	 */
 	/*
 	 * There's currently no mechanism for pre-reading the blocks that will
 	 * be needed to allocate a new object, so we accept the small chance
 	 * that there will be an i/o error and we will fail one of the
 	 * assertions below.
 	 */
 	if (vap->va_type == VDIR) {
 		/* Directories are ZAP objects. */
 		if (zfsvfs->z_replay) {
 			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, dnodesize, tx));
 		} else {
 			obj = zap_create_norm_dnsize(zfsvfs->z_os,
 			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 			    obj_type, bonuslen, dnodesize, tx);
 		}
 	} else {
 		if (zfsvfs->z_replay) {
 			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, dnodesize, tx));
 		} else {
 			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
 			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
 			    obj_type, bonuslen, dnodesize, tx);
 		}
 	}
 
 	/* Held until the end of the function. */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 
 	/*
 	 * If this is the root, fix up the half-initialized parent pointer
 	 * to reference the just-allocated physical data area.
 	 */
 	if (flag & IS_ROOT_NODE) {
 		dzp->z_id = obj;
 	} else {
 		dzp_pflags = dzp->z_pflags;
 	}
 
 	/*
 	 * If parent is an xattr, so am I.
 	 */
 	if (dzp_pflags & ZFS_XATTR) {
 		flag |= IS_XATTR;
 	}
 
 	if (zfsvfs->z_use_fuids)
 		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 	else
 		pflags = 0;
 
 	if (vap->va_type == VDIR) {
 		size = 2;		/* contents ("." and "..") */
 		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
 	} else {
 		size = links = 0;
 	}
 
 	if (vap->va_type == VBLK || vap->va_type == VCHR) {
 		rdev = zfs_expldev(vap->va_rdev);
 	}
 
 	parent = dzp->z_id;
 	mode = acl_ids->z_mode;
 	if (flag & IS_XATTR)
 		pflags |= ZFS_XATTR;
 
 	/*
 	 * No execs denied will be determined when zfs_mode_compute() is called.
 	 */
 	pflags |= acl_ids->z_aclp->z_hints &
 	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
 	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 
 	ZFS_TIME_ENCODE(&now, crtime);
 	ZFS_TIME_ENCODE(&now, ctime);
 
 	/* Caller-supplied atime/mtime take precedence over "now". */
 	if (vap->va_mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, atime);
 	} else {
 		ZFS_TIME_ENCODE(&now, atime);
 	}
 
 	if (vap->va_mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 	} else {
 		ZFS_TIME_ENCODE(&now, mtime);
 	}
 
 	/* Now add in all of the "SA" attributes */
 	VERIFY0(sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 	    &sa_hdl));
 
 	/*
 	 * Setup the array of attributes to be replaced/set on the new file
 	 *
 	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
 	 * in the old znode_phys_t format.  Don't change this ordering
 	 */
 	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 	} else {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 		    NULL, &mode, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 		    NULL, &size, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 		    NULL, &gen, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
 		    NULL, &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
 		    NULL, &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 		    NULL, &parent, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 		    NULL, &atime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 		    NULL, &mtime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, 16);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 		    NULL, &crtime, 16);
 	}
 
 	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
 		    &empty_xattr, 8);
 	}
 	/* rdev is always stored for DMU_OT_ZNODE, else only for devices. */
 	if (obj_type == DMU_OT_ZNODE ||
 	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
 		    NULL, &rdev, 8);
 
 	}
 	if (obj_type == DMU_OT_ZNODE) {
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &pflags, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 		    &acl_ids->z_fuid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 		    &acl_ids->z_fgid, 8);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
 		    sizeof (uint64_t) * 4);
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
 		    &acl_phys, sizeof (zfs_acl_phys_t));
 	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
 		/* The ACL is written as part of the attribute template. */
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
 		    &acl_ids->z_aclp->z_acl_count, 8);
 		locate.cb_aclp = acl_ids->z_aclp;
 		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
 		    zfs_acl_data_locator, &locate,
 		    acl_ids->z_aclp->z_acl_bytes);
 		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
 		    acl_ids->z_fuid, acl_ids->z_fgid);
 	}
 
 	VERIFY0(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx));
 
 	if (!(flag & IS_ROOT_NODE)) {
 		/* The new znode adopts sa_hdl. */
 		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 		ASSERT3P(*zpp, !=, NULL);
 	} else {
 		/*
 		 * If we are creating the root node, the "parent" we
 		 * passed in is the znode for the root.
 		 */
 		*zpp = dzp;
 
 		(*zpp)->z_sa_hdl = sa_hdl;
 	}
 
 	(*zpp)->z_pflags = pflags;
 	(*zpp)->z_mode = mode;
 	(*zpp)->z_dnodesize = dnodesize;
 
 	if (vap->va_mask & AT_XVATTR)
 		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
 
 	/* Cases in which the ACL was not part of the template above. */
 	if (obj_type == DMU_OT_ZNODE ||
 	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
 		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 	}
 	if (!(flag & IS_ROOT_NODE)) {
 		vnode_t *vp = ZTOV(*zpp);
 		/* VV_FORCEINSMQ is set only for the duration of insmntque(). */
 		vp->v_vflag |= VV_FORCEINSMQ;
 		int err = insmntque(vp, zfsvfs->z_vfs);
 		vp->v_vflag &= ~VV_FORCEINSMQ;
 		(void) err;
 		KASSERT(err == 0, ("insmntque() failed: error %d", err));
 	}
 	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 }
 
 /*
  * Helper for zfs_xvattr_set(): if optional attribute `xat' was requested
  * in xvap, apply xoap->field to `zflag' in zp->z_pflags via ZFS_ATTR_SET()
  * and mark the attribute as returned.  Uses the zp, xvap, xoap and tx
  * variables of the enclosing function.
  */
 #define	ZFS_XVATTR_APPLY_FLAG(xat, zflag, field)			\
 	do {								\
 		if (XVA_ISSET_REQ(xvap, xat)) {				\
 			ZFS_ATTR_SET(zp, zflag, xoap->field,		\
 			    zp->z_pflags, tx);				\
 			XVA_SET_RTN(xvap, xat);				\
 		}							\
 	} while (0)
 
 /*
  * Update in-core attributes from the requested optional attributes in xvap.
  * It is assumed the caller will be doing an sa_bulk_update to push the
  * changes out.  Every attribute that is handled is marked as returned.
  */
 void
 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 {
 	xoptattr_t *xoap = xva_getxoptattr(xvap);
 
 	ASSERT3P(xoap, !=, NULL);
 
 	if (!zp->z_zfsvfs->z_replay)
 		ASSERT_VOP_IN_SEQC(ZTOV(zp));
 
 	/* The creation time is a real attribute, not a z_pflags bit. */
 	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 		uint64_t times[2];
 
 		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
 		    &times, sizeof (times), tx);
 		XVA_SET_RTN(xvap, XAT_CREATETIME);
 	}
 
 	ZFS_XVATTR_APPLY_FLAG(XAT_READONLY, ZFS_READONLY, xoa_readonly);
 	ZFS_XVATTR_APPLY_FLAG(XAT_HIDDEN, ZFS_HIDDEN, xoa_hidden);
 	ZFS_XVATTR_APPLY_FLAG(XAT_SYSTEM, ZFS_SYSTEM, xoa_system);
 	ZFS_XVATTR_APPLY_FLAG(XAT_ARCHIVE, ZFS_ARCHIVE, xoa_archive);
 	ZFS_XVATTR_APPLY_FLAG(XAT_IMMUTABLE, ZFS_IMMUTABLE, xoa_immutable);
 	ZFS_XVATTR_APPLY_FLAG(XAT_NOUNLINK, ZFS_NOUNLINK, xoa_nounlink);
 	ZFS_XVATTR_APPLY_FLAG(XAT_APPENDONLY, ZFS_APPENDONLY, xoa_appendonly);
 	ZFS_XVATTR_APPLY_FLAG(XAT_NODUMP, ZFS_NODUMP, xoa_nodump);
 	ZFS_XVATTR_APPLY_FLAG(XAT_OPAQUE, ZFS_OPAQUE, xoa_opaque);
 	ZFS_XVATTR_APPLY_FLAG(XAT_AV_QUARANTINED, ZFS_AV_QUARANTINED,
 	    xoa_av_quarantined);
 	ZFS_XVATTR_APPLY_FLAG(XAT_AV_MODIFIED, ZFS_AV_MODIFIED,
 	    xoa_av_modified);
 
 	/* The scanstamp is stored through its own helper. */
 	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 		zfs_sa_set_scanstamp(zp, xvap, tx);
 		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
 	}
 
 	ZFS_XVATTR_APPLY_FLAG(XAT_REPARSE, ZFS_REPARSE, xoa_reparse);
 	ZFS_XVATTR_APPLY_FLAG(XAT_OFFLINE, ZFS_OFFLINE, xoa_offline);
 	ZFS_XVATTR_APPLY_FLAG(XAT_SPARSE, ZFS_SPARSE, xoa_sparse);
 }
 
 #undef	ZFS_XVATTR_APPLY_FLAG
 
 /*
  * Look up the znode for object obj_num, creating a new znode/vnode pair if
  * the object has none attached yet.
  *
  *	IN:	zfsvfs	- file system to look in
  *		obj_num	- object number
  *
  *	OUT:	zpp	- the znode on success, NULL otherwise
  *
  * Returns 0 on success, the sa_buf_hold() error if the bonus buffer cannot
  * be held, EINVAL if the bonus buffer is neither DMU_OT_SA nor a
  * sufficiently large DMU_OT_ZNODE, ENOENT if the znode is unlinked or
  * cannot be constructed, or the insmntque() error.
  *
  * A vnode reservation is taken for the duration of the call and dropped on
  * every return path.
  */
 int
 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
 {
 	dmu_object_info_t doi;
 	dmu_buf_t	*db;
 	znode_t		*zp;
 	vnode_t		*vp;
 	sa_handle_t	*hdl;
 	int locked;
 	int err;
 
 	getnewvnode_reserve();
 again:
 	*zpp = NULL;
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		getnewvnode_drop_reserve();
 		return (err);
 	}
 
 	/* Reject objects whose bonus buffer cannot describe a znode. */
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		getnewvnode_drop_reserve();
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* An existing SA handle means a znode is already attached. */
 	hdl = dmu_buf_get_user(db);
 	if (hdl != NULL) {
 		zp = sa_get_userdata(hdl);
 
 		/*
 		 * Since "SA" does immediate eviction we
 		 * should never find a sa handle that doesn't
 		 * know about the znode.
 		 */
 		ASSERT3P(zp, !=, NULL);
 		ASSERT3U(zp->z_id, ==, obj_num);
 		if (zp->z_unlinked) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			vp = ZTOV(zp);
 			/*
 			 * Don't let the vnode disappear after
 			 * ZFS_OBJ_HOLD_EXIT.
 			 */
 			VN_HOLD(vp);
 			*zpp = zp;
 			err = 0;
 		}
 
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
 		if (err) {
 			getnewvnode_drop_reserve();
 			return (err);
 		}
 
 		locked = VOP_ISLOCKED(vp);
 		VI_LOCK(vp);
 		if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
 			/*
 			 * The vnode is doomed and this thread doesn't
 			 * hold the exclusive lock on it, so the vnode
 			 * must be being reclaimed by another thread.
 			 * Otherwise the doomed vnode is being reclaimed
 			 * by this thread and zfs_zget is called from
 			 * ZIL internals.
 			 */
 			VI_UNLOCK(vp);
 
 			/*
 			 * XXX vrele() locks the vnode when the last reference
 			 * is dropped.  Although in this case the vnode is
 			 * doomed / dead and so no inactivation is required,
 			 * the vnode lock is still acquired.  That could result
 			 * in a LOR with z_teardown_lock if another thread holds
 			 * the vnode's lock and tries to take z_teardown_lock.
 			 * But that is only possible if the other thread performs
 			 * a ZFS vnode operation on the vnode.  That either
 			 * should not happen if the vnode is dead or the thread
 			 * should also have a reference to the vnode and thus
 			 * our reference is not last.
 			 */
 			VN_RELE(vp);
 			/* Drop our hold and start over; *zpp is reset at "again". */
 			goto again;
 		}
 		VI_UNLOCK(vp);
 		getnewvnode_drop_reserve();
 		return (err);
 	}
 
 	/*
 	 * Not found create new znode/vnode
 	 * but only if file exists.
 	 *
 	 * There is a small window where zfs_vget() could
 	 * find this object while a file create is still in
 	 * progress.  This is checked for in zfs_znode_alloc()
 	 *
 	 * if zfs_znode_alloc() fails it will drop the hold on the
 	 * bonus buffer.
 	 */
 	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
 	    doi.doi_bonus_type, NULL);
 	if (zp == NULL) {
 		err = SET_ERROR(ENOENT);
 	} else {
 		*zpp = zp;
 	}
 	/* err is still 0 from sa_buf_hold() unless the allocation failed. */
 	if (err == 0) {
 		vnode_t *vp = ZTOV(zp);
 
 		err = insmntque(vp, zfsvfs->z_vfs);
 		if (err == 0) {
 			vp->v_hash = obj_num;
 			VOP_UNLOCK(vp);
 		} else {
 			/* Undo the znode setup; the caller gets no znode. */
 			zp->z_vnode = NULL;
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_free(zp);
 			*zpp = NULL;
 		}
 	}
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 	getnewvnode_drop_reserve();
 	return (err);
 }
 
 int
 zfs_rezget(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	dmu_object_info_t doi;
 	dmu_buf_t *db;
 	vnode_t *vp;
 	uint64_t obj_num = zp->z_id;
 	uint64_t mode, size;
 	sa_bulk_attr_t bulk[8];
 	int err;
 	int count = 0;
 	uint64_t gen;
 
 	/*
 	 * Remove cached pages before reloading the znode, so that they are not
 	 * lingering after we run into any error.  Ideally, we should vgone()
 	 * the vnode in case of error, but currently we cannot do that
 	 * because of the LOR between the vnode lock and z_teardown_lock.
 	 * So, instead, we have to "doom" the znode in the illumos style.
 	 *
 	 * Ignore invalid pages during the scan.  This is to avoid deadlocks
 	 * between page busying and the teardown lock, as pages are busied prior
 	 * to a VOP_GETPAGES operation, which acquires the teardown read lock.
 	 * Such pages will be invalid and can safely be skipped here.
 	 */
 	vp = ZTOV(zp);
 #if __FreeBSD_version >= 1400042
 	vn_pages_remove_valid(vp, 0, 0);
 #else
 	vn_pages_remove(vp, 0, 0);
 #endif
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
 
 	mutex_enter(&zp->z_acl_lock);
 	if (zp->z_acl_cached) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 	mutex_exit(&zp->z_acl_lock);
 
 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
 	if (zp->z_xattr_cached) {
 		nvlist_free(zp->z_xattr_cached);
 		zp->z_xattr_cached = NULL;
 	}
 	rw_exit(&zp->z_xattr_lock);
 
 	ASSERT3P(zp->z_sa_hdl, ==, NULL);
 	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
 	if (err) {
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (err);
 	}
 
 	dmu_object_info_from_db(db, &doi);
 	if (doi.doi_bonus_type != DMU_OT_SA &&
 	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
 	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
 	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
 		sa_buf_rele(db, NULL);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EINVAL));
 	}
 
 	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
 	size = zp->z_size;
 
 	/* reload cached values */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
 	    &gen, sizeof (gen));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, sizeof (zp->z_size));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &zp->z_links, sizeof (zp->z_links));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &zp->z_atime, sizeof (zp->z_atime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 	    &zp->z_uid, sizeof (zp->z_uid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 	    &zp->z_gid, sizeof (zp->z_gid));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 	    &mode, sizeof (mode));
 
 	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EIO));
 	}
 
 	zp->z_mode = mode;
 
 	if (gen != zp->z_gen) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * It is highly improbable but still quite possible that two
 	 * objects in different datasets are created with the same
 	 * object numbers and in transaction groups with the same
 	 * numbers.  znodes corresponding to those objects would
 	 * have the same z_id and z_gen, but their other attributes
 	 * may be different.
 	 * zfs recv -F may replace one of such objects with the other.
 	 * As a result file properties recorded in the replaced
 	 * object's vnode may no longer match the received object's
 	 * properties.  At present the only cached property is the
 	 * files type recorded in v_type.
 	 * So, handle this case by leaving the old vnode and znode
 	 * disassociated from the actual object.  A new vnode and a
 	 * znode will be created if the object is accessed
 	 * (e.g. via a look-up).  The old vnode and znode will be
 	 * recycled when the last vnode reference is dropped.
 	 */
 	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * If the file has zero links, then it has been unlinked on the send
 	 * side and it must be in the received unlinked set.
 	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
 	 * stale data and to prevent automatically removal of the file in
 	 * zfs_zinactive().  The file will be removed either when it is removed
 	 * on the send side and the next incremental stream is received or
 	 * when the unlinked set gets processed.
 	 */
 	zp->z_unlinked = (zp->z_links == 0);
 	if (zp->z_unlinked) {
 		zfs_znode_dmu_fini(zp);
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 		return (0);
 	}
 
 	zp->z_blksz = doi.doi_data_block_size;
 	if (zp->z_size != size)
 		vnode_pager_setsize(vp, zp->z_size);
 
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
 
 	return (0);
 }
 
 /*
  * Free the DMU object backing a znode, plus its external ACL object if
  * it has one, using the caller's transaction, and then detach the znode
  * from the DMU.
  */
 void
 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs;
 	objset_t *os;
 	uint64_t obj, acl_obj;
 
 	zfsvfs = zp->z_zfsvfs;
 	os = zfsvfs->z_os;
 	obj = zp->z_id;
 	/* Looked up before the object hold is taken. */
 	acl_obj = zfs_external_acl(zp);
 
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 	if (acl_obj != 0) {
 		/* An external ACL object is only expected on non-SA znodes. */
 		VERIFY(!zp->z_is_sa);
 		VERIFY0(dmu_object_free(os, acl_obj, tx));
 	}
 	VERIFY0(dmu_object_free(os, obj, tx));
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 }
 
 /*
  * Release a znode that is no longer referenced.  An unlinked file on a
  * writable file system is handed to zfs_rmnode(); everything else is
  * detached from the DMU and freed.
  */
 void
 zfs_zinactive(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t obj = zp->z_id;
 	boolean_t unlink_now = B_FALSE;
 
 	ASSERT3P(zp->z_sa_hdl, !=, NULL);
 
 	/*
 	 * Don't allow a zfs_zget() while we're trying to release this znode.
 	 */
 	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
 
 	/*
 	 * A file with no links is removed once its last reference goes
 	 * away, unless the file system is mounted read-only.  That can
 	 * happen when a read-write file system had the file opened and
 	 * unlinked, and was then switched to read-only before the final
 	 * close.  In that case the file stays in the unlinked set.
 	 */
 	if (zp->z_unlinked) {
 		ASSERT(!zfsvfs->z_issnap);
 		unlink_now = ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0);
 	}
 
 	if (unlink_now) {
 		/* zfs_rmnode() is called with the object hold released. */
 		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 		zfs_rmnode(zp);
 		return;
 	}
 
 	zfs_znode_dmu_fini(zp);
 	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
 	zfs_znode_free(zp);
 }
 
 /*
  * Tear down the in-core znode: unhook it from the per-filesystem znode
  * list, drop the cached symlink target and cached ACL, and return the
  * memory.  The SA handle must already be gone.
  */
 void
 zfs_znode_free(znode_t *zp)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	char *target;
 
 	ASSERT3P(zp->z_sa_hdl, ==, NULL);
 	zp->z_vnode = NULL;
 
 	/* List membership is protected by z_znodes_lock. */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	POINTER_INVALIDATE(&zp->z_zfsvfs);
 	list_remove(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/* Clear the cached symlink pointer before freeing the string. */
 	target = atomic_load_ptr(&zp->z_cached_symlink);
 	if (target != NULL) {
 		atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
 		    (uintptr_t)NULL);
 		cache_symlink_free(target, strlen(target) + 1);
 	}
 
 	if (zp->z_acl_cached != NULL) {
 		zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = NULL;
 	}
 
 	zfs_znode_free_kmem(zp);
 }
 
 /*
  * Stamp the current time into the timestamps selected by 'flag'.
  *
  *	IN:	zp	- znode being updated (atime is stored in the znode).
  *		flag	- mask of AT_ATIME / AT_MTIME / AT_CTIME.
  *		have_tx	- nonzero when the caller is about to write the
  *			  attributes out; otherwise atime is marked dirty.
  *
  *	OUT:	mtime	- encoded modification time (when AT_MTIME is set).
  *		ctime	- encoded change time (when AT_CTIME is set).
  */
 void
 zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
     uint64_t ctime[2], boolean_t have_tx)
 {
 	timestruc_t	ts;
 
 	vfs_timestamp(&ts);
 
 	if (!have_tx) {
 		zp->z_atime_dirty = 1;
 	} else {
 		/* An sa_bulk_update is expected to follow shortly. */
 		zp->z_atime_dirty = 0;
 		zp->z_seq++;
 	}
 
 	if ((flag & AT_ATIME) != 0) {
 		ZFS_TIME_ENCODE(&ts, zp->z_atime);
 	}
 
 	if ((flag & AT_MTIME) != 0) {
 		ZFS_TIME_ENCODE(&ts, mtime);
 		if (zp->z_zfsvfs->z_use_fuids) {
 			zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
 		}
 	}
 
 	if ((flag & AT_CTIME) != 0) {
 		ZFS_TIME_ENCODE(&ts, ctime);
 		if (zp->z_zfsvfs->z_use_fuids) {
 			zp->z_pflags |= ZFS_ARCHIVE;
 		}
 	}
 }
 
 
 /*
  * Convenience form of zfs_tstamp_update_setup_ext() that always passes
  * have_tx == B_TRUE.
  */
 void
 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
     uint64_t ctime[2])
 {
 	zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
 }
 /*
  * Grow the block size for a file.
  *
  *	IN:	zp	- znode of the file whose block size should grow.
  *		size	- requested block size
  *		tx	- open transaction.
  *
  * NOTE: this function assumes that the znode is write locked.
  */
 void
 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
 {
 	u_longlong_t	unused;
 	int		error;
 
 	/*
 	 * Two reasons to leave the block size alone: the request does not
 	 * exceed the current block size, or the file is already larger
 	 * than its (nonzero) current block size.  If there is more than
 	 * one block in a file, the blocksize cannot change.
 	 */
 	if (size <= zp->z_blksz ||
 	    (zp->z_blksz && zp->z_size > zp->z_blksz))
 		return;
 
 	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
 	    size, 0, tx);
 	if (error == ENOTSUP)
 		return;
 	ASSERT0(error);
 
 	/* Read back the block size that was actually set. */
 	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz,
 	    &unused);
 }
 
 /*
  * Increase the file length
  *
  *	IN:	zp	- znode of file to extend.
  *		end	- new end-of-file
  *
  *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_extend(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	dmu_tx_t *tx;
 	zfs_locked_range_t *lr;
 	uint64_t newblksz;
 	int error;
 
 	/*
 	 * We will change zp->z_size, lock the whole file.
 	 */
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end <= zp->z_size) {
 		zfs_rangelock_exit(lr);
 		return (0);
 	}
 	/* The new size is written through the SA handle below. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	/*
 	 * A block size change is considered only when the new EOF lies
 	 * beyond the current block size and that block size is either not
 	 * a power of two or still below z_max_blksz.
 	 */
 	if (end > zp->z_blksz &&
 	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
 		/*
 		 * We are growing the file past the current block size.
 		 */
 		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
 			/*
 			 * File's blocksize is already larger than the
 			 * "recordsize" property.  Only let it grow to
 			 * the next power of 2.
 			 */
 			ASSERT(!ISP2(zp->z_blksz));
 			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
 		} else {
 			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
 		}
 		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
 	} else {
 		/* Zero means "leave the block size alone" below. */
 		newblksz = 0;
 	}
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		zfs_rangelock_exit(lr);
 		return (error);
 	}
 
 	if (newblksz)
 		zfs_grow_blocksize(zp, newblksz, tx);
 
 	zp->z_size = end;
 
 	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx));
 
 	/* Keep the vnode pager's notion of the file size in step. */
 	vnode_pager_setsize(ZTOV(zp), end);
 
 	/* The range lock is dropped before the transaction is committed. */
 	zfs_rangelock_exit(lr);
 
 	dmu_tx_commit(tx);
 
 	return (0);
 }
 
 /*
  * Free space in a file.
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of section to free.
  *		len	- length of section to free.
  *
  *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zfs_locked_range_t *range;
 	int rc;
 
 	/* Only the range being freed is locked. */
 	range = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
 
 	if (off < zp->z_size) {
 		/* Clamp the request so it stops at the current EOF. */
 		if (off + len > zp->z_size)
 			len = zp->z_size - off;
 
 		rc = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
 		if (rc == 0) {
 #if __FreeBSD_version >= 1400032
 			vnode_pager_purge_range(ZTOV(zp), off, off + len);
 #else
 			/*
 			 * Before __FreeBSD_version 1400032 we cannot free
 			 * block in the middle of a file, but only at the end
 			 * of a file, so this code path should never happen.
 			 */
 			vnode_pager_setsize(ZTOV(zp), off);
 #endif
 		}
 	} else {
 		/* The range starts at or beyond EOF: nothing to free. */
 		rc = 0;
 	}
 
 	zfs_rangelock_exit(range);
 
 	return (rc);
 }
 
 /*
  * Truncate a file
  *
  *	IN:	zp	- znode of file to truncate.
  *		end	- new end-of-file.
  *
  *	RETURN:	0 on success, error code on failure
  */
 static int
 zfs_trunc(znode_t *zp, uint64_t end)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	vnode_t *vp = ZTOV(zp);
 	dmu_tx_t *tx;
 	zfs_locked_range_t *lr;
 	int error;
 	sa_bulk_attr_t bulk[2];
 	int count = 0;
 
 	/*
 	 * We will change zp->z_size, lock the whole file.
 	 */
 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 
 	/*
 	 * Nothing to do if file already at desired length.
 	 */
 	if (end >= zp->z_size) {
 		zfs_rangelock_exit(lr);
 		return (0);
 	}
 
 	/* Free everything from the new EOF to the end of the object. */
 	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
 	    DMU_OBJECT_END);
 	if (error) {
 		zfs_rangelock_exit(lr);
 		return (error);
 	}
 	/* The size (and possibly flags) update goes in its own tx. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		zfs_rangelock_exit(lr);
 		return (error);
 	}
 
 	zp->z_size = end;
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
 	    NULL, &zp->z_size, sizeof (zp->z_size));
 
 	/* Truncating to zero length also clears the ZFS_SPARSE flag. */
 	if (end == 0) {
 		zp->z_pflags &= ~ZFS_SPARSE;
 		/* NOTE(review): 8 is presumably sizeof (zp->z_pflags). */
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &zp->z_pflags, 8);
 	}
 	VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * Clear any mapped pages in the truncated region.  This has to
 	 * happen outside of the transaction to avoid the possibility of
 	 * a deadlock with someone trying to push a page that we are
 	 * about to invalidate.
 	 */
 	vnode_pager_setsize(vp, end);
 
 	zfs_rangelock_exit(lr);
 
 	return (0);
 }
 
 /*
  * Free space in a file
  *
  *	IN:	zp	- znode of file to free data in.
  *		off	- start of range
  *		len	- length of range (0 => truncate the file at off)
  *		flag	- current file open mode flags (not referenced in
  *			  this function).
  *		log	- TRUE if this action should be logged
  *
  *	RETURN:	0 on success, error code on failure
  */
 int
 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
 {
 	dmu_tx_t *tx;
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog = zfsvfs->z_log;
 	uint64_t mode;
 	uint64_t mtime[2], ctime[2];
 	sa_bulk_attr_t bulk[3];
 	int count = 0;
 	int error;
 
 	/*
 	 * The mode value itself is not used below, but a failed lookup
 	 * aborts the call.
 	 */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
 	    sizeof (mode))) != 0)
 		return (error);
 
 	/* Range starts beyond EOF: only grow the file to off + len. */
 	if (off > zp->z_size) {
 		error =  zfs_extend(zp, off+len);
 		if (error == 0 && log)
 			goto log;
 		else
 			return (error);
 	}
 
 	if (len == 0) {
 		/* Zero length means truncate at off. */
 		error = zfs_trunc(zp, off);
 	} else {
 		/*
 		 * Free the range; if it reaches past the current EOF the
 		 * file is then extended to off + len.
 		 */
 		if ((error = zfs_free_range(zp, off, len)) == 0 &&
 		    off + len > zp->z_size)
 			error = zfs_extend(zp, off+len);
 	}
 	if (error || !log)
 		return (error);
 log:
 	/* Update mtime/ctime and flags, and log the truncate, in a new tx. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	/* 16 is the size of the two-element uint64_t timestamp arrays. */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 	    NULL, &zp->z_pflags, 8);
 	/* Fills mtime/ctime (registered above by address) before the update. */
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 	ASSERT0(error);
 
 	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 /*
  * Populate an objset with the initial ZPL objects: the master node
  * (holding the zpl properties and the ZPL version), the SA attribute
  * registration object when the version supports SA, the unlinked set,
  * the root directory and the shares directory.
  *
  *	IN:	os	 - objset to initialize.
  *		cr	 - credentials; supply the root directory's uid/gid.
  *		zplprops - nvlist of uint64 zpl properties.
  *		tx	 - transaction passed to every create/update call.
  *
  * A temporary zfsvfs_t and root znode are allocated for the duration of
  * the call and freed before returning.
  */
 void
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	uint64_t	moid, obj, sa_obj, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
 	int		error;
 	int		i;
 	znode_t		*rootzp = NULL;
 	zfsvfs_t	*zfsvfs;
 	vattr_t		vattr;
 	znode_t		*zp;
 	zfs_acl_ids_t	acl_ids;
 
 	/*
 	 * First attempt to create master node.
 	 */
 	/*
 	 * In an empty objset, there are no blocks to read and thus
 	 * there can be no i/o errors (which we assert below).
 	 */
 	moid = MASTER_NODE_OBJ;
 	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
 	    DMU_OT_NONE, 0, tx);
 	ASSERT0(error);
 
 	/*
 	 * Set starting attributes.
 	 */
 	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
 		uint64_t val;
 		const char *name;
 
 		ASSERT3S(nvpair_type(elem), ==, DATA_TYPE_UINT64);
 		val = fnvpair_value_uint64(elem);
 		name = nvpair_name(elem);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
 			/* A requested version can only lower the default. */
 			if (val < version)
 				version = val;
 		} else {
 			error = zap_update(os, moid, name, 8, 1, &val, tx);
 		}
 		/* In the version case this re-checks the previous result. */
 		ASSERT0(error);
 		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
 			norm = val;
 		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
 			sense = val;
 	}
 	ASSERT3U(version, !=, 0);
 	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
 	ASSERT0(error);
 
 	/*
 	 * Create zap object used for SA attribute registration
 	 */
 
 	if (version >= ZPL_VERSION_SA) {
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 	} else {
 		sa_obj = 0;
 	}
 	/*
 	 * Create a delete queue.
 	 */
 	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
 
 	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
 	ASSERT0(error);
 
 	/*
 	 * Create root znode.  Create minimal znode/vnode/zfsvfs
 	 * to allow zfs_mknode to work.
 	 */
 	VATTR_NULL(&vattr);
 	vattr.va_mask = AT_MODE|AT_UID|AT_GID;
 	vattr.va_type = VDIR;
 	vattr.va_mode = S_IFDIR|0755;
 	vattr.va_uid = crgetuid(cr);
 	vattr.va_gid = crgetgid(cr);
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
 	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
 	rootzp->z_unlinked = 0;
 	rootzp->z_atime_dirty = 0;
 	rootzp->z_is_sa = USE_SA(version, os);
 
 	zfsvfs->z_os = os;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_version = version;
 	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
 	zfsvfs->z_use_sa = USE_SA(version, os);
 	zfsvfs->z_norm = norm;
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 
 	ASSERT0(error);
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/* Attach the znode to the temporary zfsvfs only while it is used. */
 	rootzp->z_zfsvfs = zfsvfs;
 	VERIFY0(zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
 	    cr, NULL, &acl_ids, NULL));
 	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
 	ASSERT3P(zp, ==, rootzp);
 	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
 	ASSERT0(error);
 	zfs_acl_ids_free(&acl_ids);
 	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
 
 	sa_handle_destroy(rootzp->z_sa_hdl);
 	zfs_znode_free_kmem(rootzp);
 
 	/*
 	 * Create shares directory
 	 */
 
 	error = zfs_create_share_dir(zfsvfs, tx);
 
 	ASSERT0(error);
 
 	/*
 	 * NOTE(review): z_znodes_lock and z_all_znodes are initialized
 	 * above but are not torn down here; only z_hold_mtx[] is
 	 * destroyed before the zfsvfs is freed.  Confirm this is intended.
 	 */
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 /*
  * Bring the vnode pager's size in line with zp->z_size.  Nothing is
  * done when the vnode has no VM object or the sizes already agree.
  */
 void
 zfs_znode_update_vfs(znode_t *zp)
 {
 	vm_object_t object;
 
 	object = ZTOV(zp)->v_object;
 	if (object == NULL)
 		return;
 	if (zp->z_size == object->un_pager.vnp.vnp_size)
 		return;
 
 	vnode_pager_setsize(ZTOV(zp), zp->z_size);
 }
 
 /*
  * Find the parent directory of a znode and the name under which the
  * znode appears in it.
  *
  *	IN:	zp	- znode to look up.
  *
  *	OUT:	dzpp	- parent znode, obtained through zfs_zget().
  *		buf	- entry name found in the parent directory.
  *
  *	RETURN:	0 on success; EINVAL for extended attributes and for a
  *		root object; otherwise the error from the failing lookup.
  */
 int
 zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	uint64_t pobj;
 	int is_xattrdir;
 	int error;
 
 	/* Extended attributes should not be visible as regular files. */
 	if ((zp->z_pflags & ZFS_XATTR) != 0)
 		return (SET_ERROR(EINVAL));
 
 	error = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl,
 	    zfsvfs->z_attr_table, &pobj, &is_xattrdir);
 	if (error != 0)
 		return (error);
 	ASSERT0(is_xattrdir);
 
 	/* An object that is its own parent is a root and has no name. */
 	if (pobj == zp->z_id)
 		return (SET_ERROR(EINVAL));
 
 	error = zap_value_search(zfsvfs->z_os, pobj, zp->z_id,
 	    ZFS_DIRENT_OBJ(-1ULL), buf);
 	if (error == 0)
 		error = zfs_zget(zfsvfs, pobj, dzpp);
 	return (error);
 }
 
 /*
  * Check a prospective file size against the current thread's
  * RLIMIT_FSIZE.
  *
  *	RETURN:	0 when the size is within the limit (or there is no
  *		current thread); EFBIG after posting SIGXFSZ to the
  *		process otherwise.
  */
 int
 zfs_rlimit_fsize(off_t fsize)
 {
 	struct thread *td;
 	off_t limit;
 
 	td = curthread;
 	if (td == NULL)
 		return (0);
 
 	limit = lim_cur(td, RLIMIT_FSIZE);
 	if (!__predict_true((uoff_t)fsize <= limit)) {
 		/*
 		 * The limit is reached.
 		 */
 		PROC_LOCK(td->td_proc);
 		kern_psignal(td->td_proc, SIGXFSZ);
 		PROC_UNLOCK(td->td_proc);
 
 		return (EFBIG);
 	}
 
 	return (0);
 }
diff --git a/sys/fs/devfs/devfs_devs.c b/sys/fs/devfs/devfs_devs.c
index c6dcd4fc7646..124f9f0449af 100644
--- a/sys/fs/devfs/devfs_devs.c
+++ b/sys/fs/devfs/devfs_devs.c
@@ -1,754 +1,751 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2000,2004
  *	Poul-Henning Kamp.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vfsops.c 1.36
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <sys/kdb.h>
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * The one true (but secret) list of active devices in the system.
  * Locked by dev_lock()/devmtx
  */
 struct cdev_priv_list cdevp_list = TAILQ_HEAD_INITIALIZER(cdevp_list);
 
 /* Unit-number allocator; devfs_vmkdir() draws inode numbers from it. */
 struct unrhdr *devfs_inos;
 
 /*
  * Malloc types used in this file: M_DEVFS2 for the per-cdev dirent
  * pointer arrays, M_DEVFS3 for struct devfs_dirent, and M_CDEVP for
  * struct cdev_priv.
  */
 static MALLOC_DEFINE(M_DEVFS2, "DEVFS2", "DEVFS data 2");
 static MALLOC_DEFINE(M_DEVFS3, "DEVFS3", "DEVFS data 3");
 static MALLOC_DEFINE(M_CDEVP, "DEVFS1", "DEVFS cdev_priv storage");
 
 /* Root of the vfs.devfs sysctl subtree. */
 SYSCTL_NODE(_vfs, OID_AUTO, devfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "DEVFS filesystem");
 
 /* Exported read-only as vfs.devfs.generation. */
 static unsigned devfs_generation;
 SYSCTL_UINT(_vfs_devfs, OID_AUTO, generation, CTLFLAG_RD,
 	&devfs_generation, 0, "DEVFS generation number");
 
 /* Exported read-write as vfs.devfs.rule_depth; defaults to 1. */
 unsigned devfs_rule_depth = 1;
 SYSCTL_UINT(_vfs_devfs, OID_AUTO, rule_depth, CTLFLAG_RW,
 	&devfs_rule_depth, 0, "Max depth of ruleset include");
 
 /*
  * Helper sysctl for devname(3).  We're given a dev_t and return the
  * name, if any, registered by the device driver.
  *
  * Returns EINVAL when no input was supplied or the input is NODEV,
  * ENOENT when no entry on cdevp_list has a matching cdp_inode, and
  * otherwise the result of copying out the NUL-terminated si_name.
  */
 static int
 sysctl_devname(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	dev_t ud;
 #ifdef COMPAT_FREEBSD11
 	uint32_t ud_compat;
 #endif
 	struct cdev_priv *cdp;
 	struct cdev *dev;
 
 	/* The dev_t to look up arrives as the "new value". */
 	if (req->newptr == NULL)
 		return (EINVAL);
 
 #ifdef COMPAT_FREEBSD11
 	/*
 	 * An input of sizeof(uint32_t) bytes is read as a 32-bit value and
 	 * widened, mapping the truncated NODEV back to NODEV.  The
 	 * SYSCTL_IN() after the #endif is the else-branch of this test.
 	 */
 	if (req->newlen == sizeof(ud_compat)) {
 		error = SYSCTL_IN(req, &ud_compat, sizeof(ud_compat));
 		if (error == 0)
 			ud = ud_compat == (uint32_t)NODEV ? NODEV : ud_compat;
 	} else
 #endif
 		error = SYSCTL_IN(req, &ud, sizeof (ud));
 	if (error)
 		return (error);
 	if (ud == NODEV)
 		return (EINVAL);
 	dev = NULL;
 	/* Search the device list by inode number under the dev lock. */
 	dev_lock();
 	TAILQ_FOREACH(cdp, &cdevp_list, cdp_list)
 		if (cdp->cdp_inode == ud) {
 			dev = &cdp->cdp_c;
 			/* Reference taken here is dropped by dev_rel() below. */
 			dev_refl(dev);
 			break;
 		}
 	dev_unlock();
 	if (dev == NULL)
 		return (ENOENT);
 	error = SYSCTL_OUT(req, dev->si_name, strlen(dev->si_name) + 1);
 	dev_rel(dev);
 	return (error);
 }
 
 /* Registered as kern.devname; CTLFLAG_ANYBODY is set on the node. */
 SYSCTL_PROC(_kern, OID_AUTO, devname,
     CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MPSAFE,
     NULL, 0, sysctl_devname, "", "devname(3) handler");
 
-SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct cdev), "sizeof(struct cdev)");
-
-SYSCTL_INT(_debug_sizeof, OID_AUTO, cdev_priv, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct cdev_priv), "sizeof(struct cdev_priv)");
+SYSCTL_SIZEOF_STRUCT(cdev);
+SYSCTL_SIZEOF_STRUCT(cdev_priv);
 
 struct cdev *
 devfs_alloc(int flags)
 {
 	struct cdev_priv *cdp;
 	struct cdev *cdev;
 	struct timespec ts;
 
 	cdp = malloc(sizeof *cdp, M_CDEVP, M_ZERO |
 	    ((flags & MAKEDEV_NOWAIT) ? M_NOWAIT : M_WAITOK));
 	if (cdp == NULL)
 		return (NULL);
 
 	mtx_init(&cdp->cdp_threadlock, "devthrd", NULL, MTX_DEF);
 
 	cdp->cdp_dirents = &cdp->cdp_dirent0;
 
 	cdev = &cdp->cdp_c;
 	LIST_INIT(&cdev->si_children);
 	vfs_timestamp(&ts);
 	cdev->si_atime = cdev->si_mtime = cdev->si_ctime = ts;
 
 	return (cdev);
 }
 
 /*
  * Report whether 'name' collides with an existing name: an active
  * device for which devfs_pathpath() reports a relation with 'name' in
  * either argument order, or a name known to devfs_dir_find().  The dev
  * lock must be held.
  *
  * Returns 1 on a collision and 0 otherwise.
  */
 int
 devfs_dev_exists(const char *name)
 {
 	struct cdev_priv *cdp;
 	const char *devname;
 
 	dev_lock_assert_locked();
 
 	TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) {
 		if ((cdp->cdp_flags & CDP_ACTIVE) != 0) {
 			devname = cdp->cdp_c.si_name;
 			if (devfs_pathpath(devname, name) != 0 ||
 			    devfs_pathpath(name, devname) != 0)
 				return (1);
 		}
 	}
 
 	return (devfs_dir_find(name) != 0 ? 1 : 0);
 }
 
 /*
  * Release a cdev obtained from devfs_alloc(): drop its credential
  * reference, give back its inode number, free a malloc'ed dirent
  * array, and free the enclosing cdev_priv.  The device must no longer
  * be active or on the active list.
  */
 void
 devfs_free(struct cdev *cdev)
 {
 	struct cdev_priv *priv;
 
 	priv = cdev2priv(cdev);
 	KASSERT((priv->cdp_flags & (CDP_ACTIVE | CDP_ON_ACTIVE_LIST)) == 0,
 	    ("%s: cdp %p (%s) still on active list",
 	    __func__, priv, cdev->si_name));
 
 	if (cdev->si_cred != NULL)
 		crfree(cdev->si_cred);
 	devfs_free_cdp_inode(priv->cdp_inode);
 
 	/* cdp_maxdirent == 0 means the embedded one-slot array is in use. */
 	if (priv->cdp_maxdirent > 0)
 		free(priv->cdp_dirents, M_DEVFS2);
 
 	mtx_destroy(&priv->cdp_threadlock);
 	free(priv, M_CDEVP);
 }
 
 /*
  * Look up an entry by name in directory 'dd'.
  *
  *	dd	- directory whose de_dlist is searched.
  *	name	- name to match; need not be NUL-terminated.
  *	namelen	- number of bytes of 'name' to compare.
  *	type	- required d_type, or 0 to accept any type.
  *
  * Returns the first matching entry, or NULL when there is none.
  * Character-device entries whose cdev is not active are skipped.
  */
 struct devfs_dirent *
 devfs_find(struct devfs_dirent *dd, const char *name, int namelen, int type)
 {
 	struct devfs_dirent *de;
 	struct dirent *ent;
 
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		ent = de->de_dirent;
 
 		/*
 		 * The race with finding non-active name is not
 		 * completely closed by the DT_CHR/CDP_ACTIVE check, but
 		 * it is similar to the devfs_allocv() in making it
 		 * unlikely enough.
 		 */
 		if (ent->d_namlen == namelen &&
 		    (type == 0 || ent->d_type == type) &&
 		    (ent->d_type != DT_CHR ||
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) != 0) &&
 		    bcmp(name, ent->d_name, namelen) == 0)
 			break;
 	}
 	KASSERT(de == NULL || (de->de_flags & DE_DOOMED) == 0,
 	    ("devfs_find: returning a doomed entry"));
 	return (de);
 }
 
 struct devfs_dirent *
 devfs_newdirent(char *name, int namelen)
 {
 	int i;
 	struct devfs_dirent *de;
 	struct dirent d;
 
 	d.d_namlen = namelen;
 	i = sizeof(*de) + GENERIC_DIRSIZ(&d);
 	de = malloc(i, M_DEVFS3, M_WAITOK | M_ZERO);
 	de->de_dirent = (struct dirent *)(de + 1);
 	de->de_dirent->d_namlen = namelen;
 	de->de_dirent->d_reclen = GENERIC_DIRSIZ(&d);
 	bcopy(name, de->de_dirent->d_name, namelen);
 	dirent_terminate(de->de_dirent);
 	vfs_timestamp(&de->de_ctime);
 	de->de_mtime = de->de_atime = de->de_ctime;
 	de->de_links = 1;
 	de->de_holdcnt = 1;
 #ifdef MAC
 	mac_devfs_init(de);
 #endif
 	return (de);
 }
 
 /*
  * Return the directory entry that contains 'de'.
  *
  * For a non-directory this is de->de_dir.  For a directory it is the
  * de_dir of the second entry on its list (the ".." entry).  NULL is
  * returned for "." and ".." entries themselves and for a directory
  * whose list has fewer than two entries.
  */
 struct devfs_dirent *
 devfs_parent_dirent(struct devfs_dirent *de)
 {
 	struct devfs_dirent *dot, *dotdot;
 
 	if (de->de_dirent->d_type != DT_DIR)
 		return (de->de_dir);
 
 	if ((de->de_flags & (DE_DOT | DE_DOTDOT)) != 0)
 		return (NULL);
 
 	dot = TAILQ_FIRST(&de->de_dlist);		/* "." */
 	if (dot == NULL)
 		return (NULL);
 
 	dotdot = TAILQ_NEXT(dot, de_list);		/* ".." */
 	if (dotdot == NULL)
 		return (NULL);
 
 	return (dotdot->de_dir);
 }
 
 /*
  * Create a directory entry together with its "." and ".." children.
  *
  *	dmp	- mount the directory belongs to.
  *	name	- directory name (namelen bytes).
  *	dotdot	- parent directory, or NULL when the new directory is its
  *		  own parent.
  *	inode	- inode number to use, or 0 to allocate one.
  *
  * With a parent given, dm_lock must be exclusively held; the new
  * directory is linked into the parent and the rules are applied to it.
  */
 struct devfs_dirent *
 devfs_vmkdir(struct devfs_mount *dmp, char *name, int namelen,
     struct devfs_dirent *dotdot, u_int inode)
 {
 	struct devfs_dirent *dir;
 	struct devfs_dirent *self, *up;
 
 	/* Create the new directory */
 	dir = devfs_newdirent(name, namelen);
 	TAILQ_INIT(&dir->de_dlist);
 	dir->de_dirent->d_type = DT_DIR;
 	dir->de_mode = 0555;
 	dir->de_links = 2;
 	dir->de_dir = dir;
 	if (inode == 0)
 		dir->de_inode = alloc_unr(devfs_inos);
 	else
 		dir->de_inode = inode;
 
 	/*
 	 * "." and ".." are always the two first entries in the
 	 * de_dlist list, so "." goes in first.
 	 */
 	self = devfs_newdirent(".", 1);
 	self->de_dirent->d_type = DT_DIR;
 	self->de_flags |= DE_DOT;
 	TAILQ_INSERT_TAIL(&dir->de_dlist, self, de_list);
 	self->de_dir = dir;
 
 	/* Then the ".." entry. */
 	up = devfs_newdirent("..", 2);
 	up->de_dirent->d_type = DT_DIR;
 	up->de_flags |= DE_DOTDOT;
 	TAILQ_INSERT_TAIL(&dir->de_dlist, up, de_list);
 	if (dotdot != NULL) {
 		up->de_dir = dotdot;
 		sx_assert(&dmp->dm_lock, SX_XLOCKED);
 		TAILQ_INSERT_TAIL(&dotdot->de_dlist, dir, de_list);
 		dotdot->de_links++;
 		devfs_rules_apply(dmp, dir);
 	} else {
 		/* No parent: ".." refers back to the directory itself. */
 		up->de_dir = dir;
 	}
 
 #ifdef MAC
 	mac_devfs_create_directory(dmp->dm_mount, name, namelen, dir);
 #endif
 	return (dir);
 }
 
 /*
  * Free a devfs_dirent.  If its vnode still points back at it through
  * v_data, that back pointer is cleared under devfs_de_interlock first.
  */
 void
 devfs_dirent_free(struct devfs_dirent *de)
 {
 	struct vnode *vp;
 
 	vp = de->de_vnode;
 
 	mtx_lock(&devfs_de_interlock);
 	if (vp != NULL) {
 		if (vp->v_data == de)
 			vp->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 
 	free(de, M_DEVFS3);
 }
 
 /*
  * Removes a directory if it is empty. Also empty parent directories are
  * removed recursively.
  *
  * dm_lock must be exclusively held.  The walk stops at the mount's root
  * directory, at an already doomed directory, and at the first directory
  * that contains anything besides "." and "..".
  */
 static void
 devfs_rmdir_empty(struct devfs_mount *dm, struct devfs_dirent *de)
 {
 	struct devfs_dirent *dd, *de_dot, *de_dotdot;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 
 	for (;;) {
 		KASSERT(de->de_dirent->d_type == DT_DIR,
 		    ("devfs_rmdir_empty: de is not a directory"));
 
 		if ((de->de_flags & DE_DOOMED) != 0 || de == dm->dm_rootdir)
 			return;
 
 		de_dot = TAILQ_FIRST(&de->de_dlist);
 		KASSERT(de_dot != NULL, ("devfs_rmdir_empty: . missing"));
 		de_dotdot = TAILQ_NEXT(de_dot, de_list);
 		KASSERT(de_dotdot != NULL, ("devfs_rmdir_empty: .. missing"));
 		/* Return if the directory is not empty. */
 		if (TAILQ_NEXT(de_dotdot, de_list) != NULL)
 			return;
 
 		/* Look up the parent while ".." is still on the list. */
 		dd = devfs_parent_dirent(de);
 		KASSERT(dd != NULL, ("devfs_rmdir_empty: NULL dd"));
 		TAILQ_REMOVE(&de->de_dlist, de_dot, de_list);
 		TAILQ_REMOVE(&de->de_dlist, de_dotdot, de_list);
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		/* Hold the parent across the devfs_delete() calls. */
 		DEVFS_DE_HOLD(dd);
 		devfs_delete(dm, de, DEVFS_DEL_NORECURSE);
 		devfs_delete(dm, de_dot, DEVFS_DEL_NORECURSE);
 		devfs_delete(dm, de_dotdot, DEVFS_DEL_NORECURSE);
 		/* Our hold was the last one: free the parent and stop. */
 		if (DEVFS_DE_DROP(dd)) {
 			devfs_dirent_free(dd);
 			return;
 		}
 
 		/* Otherwise repeat the emptiness check one level up. */
 		de = dd;
 	}
 }
 
 /*
  * Doom a directory entry and release the resources attached to it.
  *
  * The caller needs to hold the dm for the duration of the call since
  * dm->dm_lock may be temporary dropped.
  *
  * Unless DEVFS_DEL_NORECURSE is given in 'flags', the parent directory
  * is looked up and handed to devfs_rmdir_empty() afterwards.
  */
 void
 devfs_delete(struct devfs_mount *dm, struct devfs_dirent *de, int flags)
 {
 	struct devfs_dirent *dd;
 	struct vnode *vp;
 
 	KASSERT((de->de_flags & DE_DOOMED) == 0,
 		("devfs_delete doomed dirent"));
 	de->de_flags |= DE_DOOMED;
 
 	if ((flags & DEVFS_DEL_NORECURSE) == 0) {
 		dd = devfs_parent_dirent(de);
 		/* Hold the parent so it is still there for the cleanup below. */
 		if (dd != NULL)
 			DEVFS_DE_HOLD(dd);
 		if (de->de_flags & DE_USER) {
 			KASSERT(dd != NULL, ("devfs_delete: NULL dd"));
 			devfs_dir_unref_de(dm, dd);
 		}
 	} else
 		dd = NULL;
 
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		/*
 		 * Hold the vnode, then drop both the interlock and dm_lock
 		 * before locking the vnode and calling vgone(); dm_lock is
 		 * retaken afterwards.
 		 */
 		vhold(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_unlock(&dm->dm_lock);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vgone(vp);
 		VOP_UNLOCK(vp);
 		vdrop(vp);
 		sx_xlock(&dm->dm_lock);
 	} else
 		mtx_unlock(&devfs_de_interlock);
 	if (de->de_symlink) {
 		free(de->de_symlink, M_DEVFS);
 		de->de_symlink = NULL;
 	}
 #ifdef MAC
 	mac_devfs_destroy(de);
 #endif
 	/* Inode numbers above DEVFS_ROOTINO are handed back. */
 	if (de->de_inode > DEVFS_ROOTINO) {
 		devfs_free_cdp_inode(de->de_inode);
 		de->de_inode = 0;
 	}
 	if (DEVFS_DE_DROP(de))
 		devfs_dirent_free(de);
 
 	if (dd != NULL) {
 		/*
 		 * Drop the hold taken above; if the parent survives, remove
 		 * it too should it have become empty.
 		 */
 		if (DEVFS_DE_DROP(dd))
 			devfs_dirent_free(dd);
 		else
 			devfs_rmdir_empty(dm, dd);
 	}
 }
 
 /*
  * Called on unmount.
  * Recursively removes the entire tree rooted at 'dd'.
  * The caller needs to hold the dm for the duration of the call, with
  * dm_lock exclusively held.
  */
 
 static void
 devfs_purge(struct devfs_mount *dm, struct devfs_dirent *dd)
 {
 	struct devfs_dirent *de;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 
 	DEVFS_DE_HOLD(dd);
 	/*
 	 * Entries are taken from the tail with TAILQ_LAST() so that "."
 	 * and ".." are removed last.  We might need ".." to resolve a
 	 * path in devfs_dir_unref_de().
 	 */
 	while ((de = TAILQ_LAST(&dd->de_dlist, devfs_dlist_head)) != NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_flags & DE_USER)
 			devfs_dir_unref_de(dm, dd);
 
 		/* Only real subdirectories recurse; all else is deleted. */
 		if ((de->de_flags & (DE_DOT | DE_DOTDOT)) == 0 &&
 		    de->de_dirent->d_type == DT_DIR)
 			devfs_purge(dm, de);
 		else
 			devfs_delete(dm, de, DEVFS_DEL_NORECURSE);
 	}
 
 	if (DEVFS_DE_DROP(dd))
 		devfs_dirent_free(dd);
 	else if ((dd->de_flags & DE_DOOMED) == 0)
 		devfs_delete(dm, dd, DEVFS_DEL_NORECURSE);
 }
 
 /*
  * Each cdev_priv has an array of pointers to devfs_dirent which is
  * indexed by the mount point's dm_idx.
  * This function grows that array so that dm->dm_idx is a valid index,
  * taking into account that the default array is 1 element and not
  * malloc'ed.
  */
 static void
 devfs_metoo(struct cdev_priv *cdp, struct devfs_mount *dm)
 {
 	struct devfs_dirent **newtab, **oldtab;
 	int nbytes;
 
 	/* Allocate before taking the dev lock; M_WAITOK may sleep. */
 	nbytes = (dm->dm_idx + 1) * sizeof(*newtab);
 	newtab = malloc(nbytes, M_DEVFS2, M_WAITOK | M_ZERO);
 
 	dev_lock();
 	if (dm->dm_idx > cdp->cdp_maxdirent) {
 		memcpy(newtab, cdp->cdp_dirents,
 		    (cdp->cdp_maxdirent + 1) * sizeof(*newtab));
 		/* The embedded one-element array must not be freed. */
 		oldtab = cdp->cdp_maxdirent > 0 ? cdp->cdp_dirents : NULL;
 		cdp->cdp_dirents = newtab;
 		/*
 		 * XXX: if malloc told us how much we actually got this
 		 * XXX: could be optimized.
 		 */
 		cdp->cdp_maxdirent = dm->dm_idx;
 		dev_unlock();
 		free(oldtab, M_DEVFS2);
 		return;
 	}
 
 	/* We got raced: the array is already large enough. */
 	dev_unlock();
 	free(newtab, M_DEVFS2);
 }
 
 /*
  * Perform at most one unit of populate/cleanup work for mount 'dm'.
  *
  * Walks the global cdevp_list under dev_lock() and acts on the first cdev
  * that needs attention, checking in this order:
  *   1. this mount has a dirent for a cdev that is no longer CDP_ACTIVE (or
  *      for any cdev when 'cleanup' is non-zero): the dirent is removed;
  *   2. the cdev is no longer CDP_ACTIVE and its cdp_inuse count is zero:
  *      it is taken off the list and released with dev_rel();
  *   3. 'cleanup' is zero and this mount has no dirent yet for an active
  *      cdev: the dirent is created, along with any missing intermediate
  *      directories named in si_name.
  *
  * Returns 1 as soon as one such action has been performed (dev_lock() has
  * been dropped by then, so the caller restarts the walk by calling again)
  * and 0 when a full pass over the list found nothing to do.
  *
  * The caller needs to hold the dm for the duration of the call.
  */
 static int
 devfs_populate_loop(struct devfs_mount *dm, int cleanup)
 {
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	struct devfs_dirent *dd, *dt;
 	struct cdev *pdev;
 	int de_flags, depth, j;
 	char *q, *s;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	dev_lock();
 	TAILQ_FOREACH(cdp, &cdevp_list, cdp_list) {
 		KASSERT(cdp->cdp_dirents != NULL, ("NULL cdp_dirents"));
 		KASSERT((cdp->cdp_flags & CDP_ON_ACTIVE_LIST) != 0,
 		    ("%s: cdp %p (%s) should not be on active list",
 		    __func__, cdp, cdp->cdp_c.si_name));
 
 		/*
 		 * If we are unmounting, or the device has been destroyed,
 		 * clean up our dirent.
 		 */
 		if ((cleanup || !(cdp->cdp_flags & CDP_ACTIVE)) &&
 		    dm->dm_idx <= cdp->cdp_maxdirent &&
 		    cdp->cdp_dirents[dm->dm_idx] != NULL) {
 			de = cdp->cdp_dirents[dm->dm_idx];
 			/* Detach the dirent from the cdev while still locked. */
 			cdp->cdp_dirents[dm->dm_idx] = NULL;
 			KASSERT(cdp == de->de_cdp,
 			    ("%s %d %s %p %p", __func__, __LINE__,
 			    cdp->cdp_c.si_name, cdp, de->de_cdp));
 			KASSERT(de->de_dir != NULL, ("Null de->de_dir"));
 			dev_unlock();
 
 			TAILQ_REMOVE(&de->de_dir->de_dlist, de, de_list);
 			de->de_cdp = NULL;
 			de->de_inode = 0;
 			devfs_delete(dm, de, 0);
 			/* Drop the use count taken when the dirent was created. */
 			dev_lock();
 			cdp->cdp_inuse--;
 			dev_unlock();
 			return (1);
 		}
 		/*
 		 * GC any lingering devices
 		 */
 		if (!(cdp->cdp_flags & CDP_ACTIVE)) {
 			/* Still counted as in use; leave it for a later pass. */
 			if (cdp->cdp_inuse > 0)
 				continue;
 			cdp->cdp_flags &= ~CDP_ON_ACTIVE_LIST;
 			TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 			dev_unlock();
 			dev_rel(&cdp->cdp_c);
 			return (1);
 		}
 		/*
 		 * Don't create any new dirents if we are unmounting
 		 */
 		if (cleanup)
 			continue;
 		KASSERT((cdp->cdp_flags & CDP_ACTIVE), ("Bogons, I tell ya'!"));
 
 		/* This mount already has a dirent for the device. */
 		if (dm->dm_idx <= cdp->cdp_maxdirent &&
 		    cdp->cdp_dirents[dm->dm_idx] != NULL) {
 			de = cdp->cdp_dirents[dm->dm_idx];
 			KASSERT(cdp == de->de_cdp, ("inconsistent cdp"));
 			continue;
 		}
 
 		/*
 		 * Count this mount as a user of the cdev before dropping the
 		 * lock to build the dirent.
 		 */
 		cdp->cdp_inuse++;
 		dev_unlock();
 
 		/* Make sure cdp_dirents has a slot for dm_idx. */
 		if (dm->dm_idx > cdp->cdp_maxdirent)
 		        devfs_metoo(cdp, dm);
 
 		/*
 		 * Walk the '/'-separated components of si_name from the
 		 * mount's root directory, looking up or creating each
 		 * intermediate directory.  On exit from the loop, dd is the
 		 * directory for the final component and [s, q) is that
 		 * component's name.
 		 */
 		dd = dm->dm_rootdir;
 		s = cdp->cdp_c.si_name;
 		for (;;) {
 			for (q = s; *q != '/' && *q != '\0'; q++)
 				continue;
 			if (*q != '/')
 				break;
 			de = devfs_find(dd, s, q - s, 0);
 			if (de == NULL)
 				de = devfs_vmkdir(dm, s, q - s, dd, 0);
 			else if (de->de_dirent->d_type == DT_LNK) {
 				/*
 				 * A symlink has this name; use (or create) a
 				 * directory of the same name and flag it
 				 * DE_COVERED.
 				 */
 				de = devfs_find(dd, s, q - s, DT_DIR);
 				if (de == NULL)
 					de = devfs_vmkdir(dm, s, q - s, dd, 0);
 				de->de_flags |= DE_COVERED;
 			}
 			s = q + 1;
 			dd = de;
 			KASSERT(dd->de_dirent->d_type == DT_DIR &&
 			    (dd->de_flags & (DE_DOT | DE_DOTDOT)) == 0,
 			    ("%s: invalid directory (si_name=%s)",
 			    __func__, cdp->cdp_c.si_name));
 		}
 		/* The new entry is flagged DE_COVERED if a symlink has its name. */
 		de_flags = 0;
 		de = devfs_find(dd, s, q - s, DT_LNK);
 		if (de != NULL)
 			de_flags |= DE_COVERED;
 
 		de = devfs_newdirent(s, q - s);
 		if (cdp->cdp_c.si_flags & SI_ALIAS) {
 			de->de_uid = 0;
 			de->de_gid = 0;
 			de->de_mode = 0755;
 			de->de_dirent->d_type = DT_LNK;
 			pdev = cdp->cdp_c.si_parent;
 			/*
 			 * Aliases become symlinks to the parent device.  Count
 			 * how many levels dd sits below the root so the target
 			 * can be prefixed with that many "../" (3 bytes each),
 			 * followed by the parent's si_name and a NUL.
 			 */
 			dt = dd;
 			depth = 0;
 			while (dt != dm->dm_rootdir &&
 			    (dt = devfs_parent_dirent(dt)) != NULL)
 				depth++;
 			j = depth * 3 + strlen(pdev->si_name) + 1;
 			de->de_symlink = malloc(j, M_DEVFS, M_WAITOK);
 			de->de_symlink[0] = 0;
 			while (depth-- > 0)
 				strcat(de->de_symlink, "../");
 			strcat(de->de_symlink, pdev->si_name);
 		} else {
 			/* Regular device node: ownership and mode from the cdev. */
 			de->de_uid = cdp->cdp_c.si_uid;
 			de->de_gid = cdp->cdp_c.si_gid;
 			de->de_mode = cdp->cdp_c.si_mode;
 			de->de_dirent->d_type = DT_CHR;
 		}
 		de->de_flags |= de_flags;
 		de->de_inode = cdp->cdp_inode;
 		de->de_cdp = cdp;
 #ifdef MAC
 		mac_devfs_create_device(cdp->cdp_c.si_cred, dm->dm_mount,
 		    &cdp->cdp_c, de);
 #endif
 		de->de_dir = dd;
 		TAILQ_INSERT_TAIL(&dd->de_dlist, de, de_list);
 		devfs_rules_apply(dm, de);
 		/* Publish the dirent in the cdev's per-mount slot. */
 		dev_lock();
 		/* XXX: could check that cdp is still active here */
 		KASSERT(cdp->cdp_dirents[dm->dm_idx] == NULL,
 		    ("%s %d\n", __func__, __LINE__));
 		cdp->cdp_dirents[dm->dm_idx] = de;
 		KASSERT(de->de_cdp != (void *)0xdeadc0de,
 		    ("%s %d\n", __func__, __LINE__));
 		dev_unlock();
 		return (1);
 	}
 	dev_unlock();
 	return (0);
 }
 
 /*
  * Report whether the mount's recorded generation differs from the global
  * devfs_generation.  Returns 1 if it does, 0 if they match.
  */
 int
 devfs_populate_needed(struct devfs_mount *dm)
 {
 
 	if (dm->dm_generation == devfs_generation)
 		return (0);
 	return (1);
 }
 
 /*
  * Bring the mount in sync with the global device list, if it is stale.
  *
  * The generation is sampled before the work starts and that sample is what
  * gets recorded afterwards.
  *
  * The caller needs to hold the dm for the duration of the call.
  */
 void
 devfs_populate(struct devfs_mount *dm)
 {
 	unsigned snapshot;
 	int more;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	if (devfs_populate_needed(dm)) {
 		snapshot = devfs_generation;
 		/* Each call handles one device; repeat until none is left. */
 		do {
 			more = devfs_populate_loop(dm, 0);
 		} while (more);
 		dm->dm_generation = snapshot;
 	}
 }
 
 /*
  * Tear down the mount's view of the device list: run the populate loop in
  * cleanup mode until it reports nothing left to do, then purge the
  * directory tree starting at the mount's root directory.
  *
  * The caller needs to hold the dm for the duration of the call.
  */
 void
 devfs_cleanup(struct devfs_mount *dm)
 {
 	int more;
 
 	sx_assert(&dm->dm_lock, SX_XLOCKED);
 	do {
 		more = devfs_populate_loop(dm, 1);
 	} while (more);
 	devfs_purge(dm, dm->dm_rootdir);
 }
 
 /*
  * devfs_create() and devfs_destroy() are called from kern_conf.c and
  * in both cases the devlock() mutex is held, so no further locking
  * is necessary and no sleeping allowed.
  */
 
 /*
  * Put a new cdev on the global device list.  Must be called with the
  * device lock held (asserted).
  */
 void
 devfs_create(struct cdev *dev)
 {
 	struct cdev_priv *priv;
 
 	dev_lock_assert_locked();
 	priv = cdev2priv(dev);
 	KASSERT((priv->cdp_flags & CDP_ON_ACTIVE_LIST) == 0,
 	    ("%s: cdp %p (%s) already on active list",
 	    __func__, priv, dev->si_name));
 
 	/* Mark the device active and listed, and give it an inode number. */
 	priv->cdp_flags |= CDP_ACTIVE | CDP_ON_ACTIVE_LIST;
 	priv->cdp_inode = alloc_unrl(devfs_inos);
 
 	/* Take a reference on the cdev as it goes onto the list. */
 	dev_refl(dev);
 	TAILQ_INSERT_TAIL(&cdevp_list, priv, cdp_list);
 
 	/* Bump the generation so that mounts see they need to repopulate. */
 	devfs_generation++;
 }
 
 /*
  * Mark a cdev as no longer active and bump the generation.  The device
  * stays on the list here.  Must be called with the device lock held
  * (asserted).
  */
 void
 devfs_destroy(struct cdev *dev)
 {
 	struct cdev_priv *priv;
 
 	dev_lock_assert_locked();
 	priv = cdev2priv(dev);
 	priv->cdp_flags = priv->cdp_flags & ~CDP_ACTIVE;
 	devfs_generation++;
 }
 
 /*
  * Allocate an inode number from the devfs_inos unit-number allocator.
  */
 ino_t
 devfs_alloc_cdp_inode(void)
 {
 	ino_t ino;
 
 	ino = alloc_unr(devfs_inos);
 	return (ino);
 }
 
 /*
  * Return an inode number to the devfs_inos allocator.  Values that are
  * not greater than zero are ignored.
  */
 void
 devfs_free_cdp_inode(ino_t ino)
 {
 
 	if (!(ino > 0))
 		return;
 	free_unr(devfs_inos, ino);
 }
 
 /*
  * Create the inode-number allocator.  Numbers are handed out from the
  * range DEVFS_ROOTINO + 1 .. INT_MAX and the allocator is given devmtx as
  * its mutex.
  */
 static void
 devfs_devs_init(void *junk __unused)
 {
 
 	devfs_inos = new_unrhdr(DEVFS_ROOTINO + 1, INT_MAX, &devmtx);
 }
 
 /* Run the initializer first within the SI_SUB_DEVFS subsystem. */
 SYSINIT(devfs_devs, SI_SUB_DEVFS, SI_ORDER_FIRST, devfs_devs_init, NULL);
diff --git a/sys/geom/geom_kern.c b/sys/geom/geom_kern.c
index 14707403215d..f8f99087ad9c 100644
--- a/sys/geom/geom_kern.c
+++ b/sys/geom/geom_kern.c
@@ -1,242 +1,237 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The names of the authors may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <geom/geom.h>
 #include <geom/geom_int.h>
 
 MALLOC_DEFINE(M_GEOM, "GEOM", "Geom data structures");
 
 /* Initialized in g_init() under the name "GEOM topology". */
 struct sx topology_lock;
 
 /*
  * Filled in by g_init(): the same &g_proc is passed to kproc_kthread_add()
  * for each of the three threads.
  */
 static struct proc *g_proc;
 struct thread __read_mostly *g_up_td;
 struct thread __read_mostly *g_down_td;
 static struct thread __read_mostly *g_event_td;
 
 /* Exported read/write under kern.geom (debugflags, collectstats). */
 int __read_mostly g_debugflags;
 int __read_mostly g_collectstats = G_STATS_PROVIDERS;
 /* Set to 1 by geom_shutdown(), the shutdown_pre_sync handler. */
 int g_shutdown;
 /* Exported as kern.geom.notaste ("Prevent GEOM tasting"). */
 int g_notaste;
 
 /*
  * G_UP and G_DOWN are the two threads which push I/O through the
  * stack.
  *
  * Things are processed in a FIFO order, but these threads could be
  * part of I/O prioritization by deciding which bios/bioqs to service
  * in what order.
  *
  * We have only one thread in each direction, it is believed that until
  * a very non-trivial workload in the UP/DOWN path this will be enough,
  * but more than one can actually be run without problems.
  *
  * Holding the "mymutex" is a debugging feature:  It prevents people
  * from sleeping in the UP/DOWN I/O path by mistake or design (doing
  * so almost invariably results in deadlocks since it stalls all I/O
  * processing in the given direction).
  */
 
 /*
  * Body of the "g_up" thread: switch the thread to PRIBIO priority, then
  * call g_io_schedule_up() in an endless loop.  Never returns.
  */
 static void
 g_up_procbody(void *arg)
 {
 
 	/* sched_prio() is called with the thread lock held. */
 	thread_lock(g_up_td);
 	sched_prio(g_up_td, PRIBIO);
 	thread_unlock(g_up_td);
 
 	for (;;)
 		g_io_schedule_up(g_up_td);
 }
 
 /*
  * Body of the "g_down" thread: switch the thread to PRIBIO priority, then
  * call g_io_schedule_down() in an endless loop.  Never returns.
  */
 static void
 g_down_procbody(void *arg)
 {
 
 	/* sched_prio() is called with the thread lock held. */
 	thread_lock(g_down_td);
 	sched_prio(g_down_td, PRIBIO);
 	thread_unlock(g_down_td);
 
 	for (;;)
 		g_io_schedule_down(g_down_td);
 }
 
 /*
  * Body of the "g_event" thread: switch the thread to PRIBIO priority and
  * hand control to g_run_events(), which is not expected to return.
  */
 static void
 g_event_procbody(void *arg)
 {
 
 	/* sched_prio() is called with the thread lock held. */
 	thread_lock(g_event_td);
 	sched_prio(g_event_td, PRIBIO);
 	thread_unlock(g_event_td);
 	g_run_events();
 	/* NOTREACHED */
 }
 
 /*
  * Return non-zero if 'td' is one of the three GEOM threads recorded in
  * g_up_td, g_down_td and g_event_td; zero otherwise.
  */
 int
 g_is_geom_thread(struct thread *td)
 {
 
 	if (td == g_up_td)
 		return (1);
 	if (td == g_down_td)
 		return (1);
 	return (td == g_event_td);
 }
 
 /*
  * shutdown_pre_sync event handler (registered in g_init()): raise the
  * global g_shutdown flag.
  */
 static void
 geom_shutdown(void *foo __unused)
 {
 
 	g_shutdown = 1;
 }
 
 /*
  * One-time GEOM start-up: initialize the topology lock and the I/O, event
  * and control subsystems, create the g_event, g_up and g_down threads, and
  * register the shutdown hook.
  */
 void
 g_init(void)
 {
 
 	g_trace(G_T_TOPOLOGY, "g_ignition");
 	sx_init(&topology_lock, "GEOM topology");
 	g_io_init();
 	g_event_init();
 	g_ctl_init();
 	/*
 	 * All three calls pass the same &g_proc and process name "geom";
 	 * presumably the threads therefore share one kernel process --
 	 * confirm against kproc_kthread_add(9).
 	 */
 	kproc_kthread_add(g_event_procbody, NULL, &g_proc, &g_event_td,
 	    RFHIGHPID, 0, "geom", "g_event");
 	kproc_kthread_add(g_up_procbody, NULL, &g_proc, &g_up_td,
 	    RFHIGHPID, 0, "geom", "g_up");
 	kproc_kthread_add(g_down_procbody, NULL, &g_proc, &g_down_td,
 	    RFHIGHPID, 0, "geom", "g_down");
 	/* Have geom_shutdown() set g_shutdown at shutdown_pre_sync time. */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, geom_shutdown, NULL,
 		SHUTDOWN_PRI_FIRST);
 }
 
 /*
  * Common back end for the kern.geom.conf* sysctls.
  *
  * 'func' is run through g_waitfor_event() with an sbuf to fill.  When the
  * request has no output buffer (req->oldptr == NULL) the text is only
  * counted, via a counting drain, and the count is reported in
  * req->oldidx.  Otherwise the text is collected in an auto-extending sbuf
  * sized from '*hint' and copied out.  In both cases '*hint' is updated
  * with the length just seen.
  *
  * Returns 0 for the size-only case, else the result of SYSCTL_OUT().
  */
 static int
 sysctl_kern_geom_confany(struct sysctl_req *req, g_event_t *func, size_t *hint)
 {
 	struct sbuf *sb;
 	size_t counted;
 	int rv;
 
 	if (req->oldptr != NULL) {
 		/* Produce the text and hand it to the caller. */
 		sb = sbuf_new(NULL, NULL, *hint, SBUF_AUTOEXTEND |
 		    SBUF_INCLUDENUL);
 		g_waitfor_event(func, sb, M_WAITOK, NULL);
 		*hint = sbuf_len(sb);
 		rv = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
 	} else {
 		/* Size probe: count the bytes instead of storing them. */
 		counted = 0;
 		sb = sbuf_new(NULL, NULL, PAGE_SIZE, SBUF_FIXEDLEN |
 		    SBUF_INCLUDENUL);
 		sbuf_set_drain(sb, sbuf_count_drain, &counted);
 		g_waitfor_event(func, sb, M_WAITOK, NULL);
 		req->oldidx = *hint = counted;
 		rv = 0;
 	}
 	sbuf_delete(sb);
 	return (rv);
 }
 
 static int
 sysctl_kern_geom_conftxt(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_conftxt, &hint));
 }
 
 static int
 sysctl_kern_geom_confdot(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_confdot, &hint));
 }
 
 static int
 sysctl_kern_geom_confxml(SYSCTL_HANDLER_ARGS)
 {
 	static size_t hint = PAGE_SIZE;
 
 	return (sysctl_kern_geom_confany(req, g_confxml, &hint));
 }
 
 /* The kern.geom sysctl subtree. */
 SYSCTL_NODE(_kern, OID_AUTO, geom, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "GEOMetry management");
 
 /* Read-only string dumps of the configuration, one per output format. */
 SYSCTL_PROC(_kern_geom, OID_AUTO, confxml,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_geom_confxml, "",
     "Dump the GEOM config in XML");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, confdot,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_geom_confdot, "",
     "Dump the GEOM config in dot");
 
 SYSCTL_PROC(_kern_geom, OID_AUTO, conftxt,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_geom_conftxt, "",
     "Dump the GEOM config in txt");
 
 /* Writable integer knobs backed directly by the globals above. */
 SYSCTL_INT(_kern_geom, OID_AUTO, debugflags, CTLFLAG_RWTUN,
 	&g_debugflags, 0, "Set various trace levels for GEOM debugging");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, notaste, CTLFLAG_RW,
 	&g_notaste, 0, "Prevent GEOM tasting");
 
 SYSCTL_INT(_kern_geom, OID_AUTO, collectstats, CTLFLAG_RW,
 	&g_collectstats, 0,
 	"Control statistics collection on GEOM providers and consumers");
 
-SYSCTL_INT(_debug_sizeof, OID_AUTO, g_class, CTLFLAG_RD,
-	SYSCTL_NULL_INT_PTR, sizeof(struct g_class), "sizeof(struct g_class)");
-SYSCTL_INT(_debug_sizeof, OID_AUTO, g_geom, CTLFLAG_RD,
-	SYSCTL_NULL_INT_PTR, sizeof(struct g_geom), "sizeof(struct g_geom)");
-SYSCTL_INT(_debug_sizeof, OID_AUTO, g_provider, CTLFLAG_RD,
-	SYSCTL_NULL_INT_PTR, sizeof(struct g_provider), "sizeof(struct g_provider)");
-SYSCTL_INT(_debug_sizeof, OID_AUTO, g_consumer, CTLFLAG_RD,
-	SYSCTL_NULL_INT_PTR, sizeof(struct g_consumer), "sizeof(struct g_consumer)");
-SYSCTL_INT(_debug_sizeof, OID_AUTO, g_bioq, CTLFLAG_RD,
-	SYSCTL_NULL_INT_PTR, sizeof(struct g_bioq), "sizeof(struct g_bioq)");
+SYSCTL_SIZEOF_STRUCT(g_class);
+SYSCTL_SIZEOF_STRUCT(g_geom);
+SYSCTL_SIZEOF_STRUCT(g_provider);
+SYSCTL_SIZEOF_STRUCT(g_consumer);
+SYSCTL_SIZEOF_STRUCT(g_bioq);
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
index d60c72a00f63..f69275fc3d1d 100644
--- a/sys/kern/kern_mib.c
+++ b/sys/kern/kern_mib.c
@@ -1,780 +1,774 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
  * project, to make these variables more userfriendly.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sysctl.c	8.4 (Berkeley) 4/14/94
  */
 
 #include <sys/cdefs.h>
 #include "opt_posix.h"
 #include "opt_config.h"
 
 #include <sys/param.h>
 #include <sys/boot.h>
 #include <sys/elf.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/unistd.h>
 
 /*
  * Top-level sysctl nodes.  The first group uses fixed CTL_* identifiers;
  * debug.sizeof and machdep.mitigations are child nodes declared next to
  * their parents.
  */
 SYSCTL_ROOT_NODE(0, sysctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sysctl internal magic");
 SYSCTL_ROOT_NODE(CTL_KERN, kern, CTLFLAG_RW | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, 0,
     "High kernel, proc, limits &c");
 SYSCTL_ROOT_NODE(CTL_VM, vm, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Virtual memory");
 SYSCTL_ROOT_NODE(CTL_VFS, vfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "File system");
 SYSCTL_ROOT_NODE(CTL_NET, net, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Network, (see socket.h)");
 SYSCTL_ROOT_NODE(CTL_DEBUG, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Debugging");
 SYSCTL_NODE(_debug, OID_AUTO,  sizeof,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sizeof various things");
 SYSCTL_ROOT_NODE(CTL_HW, hw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "hardware");
 SYSCTL_ROOT_NODE(CTL_MACHDEP, machdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "machine dependent");
 SYSCTL_NODE(_machdep, OID_AUTO, mitigations, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Machine dependent platform mitigations.");
 SYSCTL_ROOT_NODE(CTL_USER, user, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "user-level");
 SYSCTL_ROOT_NODE(CTL_P1003_1B, p1003_1b, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "p1003_1b, (see p1003_1b.h)");
 
 /* Root nodes whose identifiers are assigned automatically (OID_AUTO). */
 SYSCTL_ROOT_NODE(OID_AUTO, compat, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Compatibility code");
 SYSCTL_ROOT_NODE(OID_AUTO, security, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 
     "Security");
 #ifdef REGRESSION
 SYSCTL_ROOT_NODE(OID_AUTO, regression, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Regression test MIB");
 #endif
 
 /*
  * Simple kern.* values.  Entries declared with SYSCTL_NULL_INT_PTR have no
  * backing variable; the constant that follows is the value reported.
  */
 SYSCTL_CONST_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD,
     kern_ident, "Kernel identifier");
 
 SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD | CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, BSD, "Operating system revision");
 
 SYSCTL_CONST_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD,
     version, "Kernel version");
 
 SYSCTL_CONST_STRING(_kern, OID_AUTO, compiler_version, CTLFLAG_RD,
     compiler_version, "Version of compiler used to compile kernel");
 
 SYSCTL_CONST_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD | CTLFLAG_CAPRD,
     ostype, "Operating system type");
 
 SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &maxproc, 0, "Maximum number of processes");
 
 SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW,
     &maxprocperuid, 0, "Maximum processes allowed per userid");
 
 SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &maxusers, 0, "Hint for kernel tuning");
 
 SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, ARG_MAX, "Maximum bytes of argument to execve(2)");
 
 SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, _POSIX_VERSION, "Version of POSIX attempting to comply to");
 
 SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RDTUN |
     CTLFLAG_NOFETCH | CTLFLAG_CAPRD, &ngroups_max, 0,
     "Maximum number of supplemental groups a user can belong to");
 
 SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, 1, "Whether job control is available");
 
 /* kern.saved_ids reports 1 or 0 depending on _POSIX_SAVED_IDS. */
 #ifdef _POSIX_SAVED_IDS
 SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, 1, "Whether saved set-group/user ID is available");
 #else
 SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, 0, "Whether saved set-group/user ID is available");
 #endif
 
 /* Backing store for kern.bootfile; defaults to PATH_KERNEL. */
 char kernelname[MAXPATHLEN] = PATH_KERNEL;	/* XXX bloat */
 
 SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW,
     kernelname, sizeof kernelname, "Name of kernel file booted");
 
 #ifdef COMPAT_FREEBSD12
 /*
  * Handler for kern.maxphys (COMPAT_FREEBSD12 variant).
  *
  * The value is normally reported as a u_long.  It is reported as an int
  * only when all of the following hold: int and u_long differ in size, the
  * caller's buffer (req->oldlen) is smaller than a u_long, and the value
  * fits in an int.
  */
 static int
 sysctl_maxphys(SYSCTL_HANDLER_ARGS)
 {
 	u_long wide;
 	int narrow;
 
 	wide = maxphys;
 	if (sizeof(int) != sizeof(u_long) && req->oldlen < sizeof(u_long) &&
 	    wide <= INT_MAX) {
 		narrow = wide;
 		return (sysctl_handle_int(oidp, &narrow, 0, req));
 	}
 	return (sysctl_handle_long(oidp, &wide, 0, req));
 }
 SYSCTL_PROC(_kern, KERN_MAXPHYS, maxphys, CTLTYPE_LONG | CTLFLAG_RDTUN |
     CTLFLAG_NOFETCH | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_maxphys, "UL", "Maximum block I/O access size");
 #else
 SYSCTL_ULONG(_kern, KERN_MAXPHYS, maxphys,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH | CTLFLAG_CAPRD,
     &maxphys, 0, "Maximum block I/O access size");
 #endif
 
 /*
  * Simple read-only hw.* integers.  hw.ncpu reads mp_ncpus; the other two
  * report compile-time constants.
  */
 SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD|CTLFLAG_CAPRD,
     &mp_ncpus, 0, "Number of active CPUs");
 
 SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, BYTE_ORDER, "System byte order");
 
 SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD|CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, PAGE_SIZE, "System memory page size");
 
 /*
  * Handler for kern.arandom: return up to 256 random bytes per request.
  *
  * The amount produced is the caller's buffer length capped at the size of
  * the on-stack buffer, which is wiped after the copy-out.
  */
 static int
 sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
 {
 	char rnd[256];
 	size_t want;
 	int rv;
 
 	want = req->oldlen;
 	if (!(want < sizeof(rnd)))
 		want = sizeof(rnd);
 	read_random(rnd, want);
 
 	rv = SYSCTL_OUT(req, rnd, want);
 	/* Do not leave the random bytes behind on the stack. */
 	explicit_bzero(rnd, want);
 	return (rv);
 }
 
 SYSCTL_PROC(_kern, KERN_ARND, arandom,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD, NULL, 0,
     sysctl_kern_arnd, "", "arc4rand");
 
 /*
  * Handler for hw.physmem: report physmem (a page count) in bytes.
  *
  * The page count is capped at SIZE_T_MAX >> PAGE_SHIFT before being
  * converted with ctob().
  */
 static int
 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
 {
 	u_long bytes, limit, pages;
 
 	limit = SIZE_T_MAX >> PAGE_SHIFT;
 	pages = limit;
 	if (physmem < limit)
 		pages = physmem;
 	bytes = ctob(pages);
 	return (sysctl_handle_long(oidp, &bytes, 0, req));
 }
 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem,
     CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_hw_physmem, "LU",
     "Amount of physical memory (in bytes)");
 
 /*
  * Handler for hw.realmem: report realmem (a page count) in bytes.
  *
  * The page count is capped at SIZE_T_MAX >> PAGE_SHIFT before being
  * converted with ctob().
  */
 static int
 sysctl_hw_realmem(SYSCTL_HANDLER_ARGS)
 {
 	u_long bytes, limit, pages;
 
 	limit = SIZE_T_MAX >> PAGE_SHIFT;
 	pages = limit;
 	if (realmem < limit)
 		pages = realmem;
 	bytes = ctob(pages);
 	return (sysctl_handle_long(oidp, &bytes, 0, req));
 }
 SYSCTL_PROC(_hw, HW_REALMEM, realmem,
     CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_hw_realmem, "LU",
     "Amount of memory (in bytes) reported by the firmware");
 
 /*
  * Handler for hw.usermem: report, in bytes, physmem minus the current
  * wired page count.
  *
  * The page count is capped at SIZE_T_MAX >> PAGE_SHIFT before being
  * converted with ctob().
  */
 static int
 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
 {
 	u_long bytes, limit, pages, unwired;
 
 	unwired = physmem - vm_wire_count();
 	limit = SIZE_T_MAX >> PAGE_SHIFT;
 	pages = limit;
 	if (unwired < limit)
 		pages = unwired;
 	bytes = ctob(pages);
 	return (sysctl_handle_long(oidp, &bytes, 0, req));
 }
 SYSCTL_PROC(_hw, HW_USERMEM, usermem,
     CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_hw_usermem, "LU",
     "Amount of memory (in bytes) which is not wired");
 
 SYSCTL_LONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0,
     "Amount of physical memory (in pages)");
 
 /* Table behind hw.pagesizes; only the first slot is initialized here. */
 u_long pagesizes[MAXPAGESIZES] = { PAGE_SIZE };
 
 /*
  * Handler for hw.pagesizes: copy out the pagesizes[] table.
  *
  * When SCTL_MASK32 is defined and set in req->flags, the table is first
  * converted to 32-bit elements.  In both cases the amount copied is capped
  * at req->oldlen when an output buffer was supplied, so a short buffer
  * gets a truncated table.
  */
 static int
 sysctl_hw_pagesizes(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	size_t len;
 #ifdef SCTL_MASK32
 	int i;
 	uint32_t pagesizes32[MAXPAGESIZES];
 
 	if (req->flags & SCTL_MASK32) {
 		/*
 		 * Recreate the "pagesizes" array with 32-bit elements.
 		 * Truncate any page size greater than UINT32_MAX to zero,
 		 * which assumes that page sizes are powers of two.
 		 */
 		for (i = 0; i < MAXPAGESIZES; i++)
 			pagesizes32[i] = (uint32_t)pagesizes[i];
 
 		len = sizeof(pagesizes32);
 		if (len > req->oldlen && req->oldptr != NULL)
 			len = req->oldlen;
 		error = SYSCTL_OUT(req, pagesizes32, len);
 	} else
 #endif
 	{
 		/* Native layout: copy the u_long table as is. */
 		len = sizeof(pagesizes);
 		if (len > req->oldlen && req->oldptr != NULL)
 			len = req->oldlen;
 		error = SYSCTL_OUT(req, pagesizes, len);
 	}
 	return (error);
 }
 SYSCTL_PROC(_hw, OID_AUTO, pagesizes,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_hw_pagesizes, "S,pagesizes",
     "Supported page sizes");
 
 /* Knob consulted by sysctl_hw_machine_arch(); enabled by default. */
 int adaptive_machine_arch = 1;
 SYSCTL_INT(_debug, OID_AUTO, adaptive_machine_arch, CTLFLAG_RW,
     &adaptive_machine_arch, 1,
     "Adapt reported machine architecture to the ABI of the binary");
 
 /*
  * Return the machine architecture string for process 'p'.
  *
  * Preference order: the process ABI's own sv_machine_arch hook if it has
  * one; MACHINE_ARCH32 for an SV_ILP32 process when COMPAT_FREEBSD32 is
  * configured; otherwise MACHINE_ARCH.
  */
 static const char *
 proc_machine_arch(struct proc *p)
 {
 
 	if (p->p_sysent->sv_machine_arch != NULL)
 		return (p->p_sysent->sv_machine_arch(p));
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(p, SV_ILP32))
 		return (MACHINE_ARCH32);
 #endif
 	return (MACHINE_ARCH);
 }
 
 /*
  * Handler for hw.machine_arch.  Reports the calling process's architecture
  * when adaptive_machine_arch is non-zero, the kernel's MACHINE_ARCH
  * otherwise.  The string is copied out including its terminating NUL.
  */
 static int
 sysctl_hw_machine_arch(SYSCTL_HANDLER_ARGS)
 {
 	const char *arch;
 
 	arch = adaptive_machine_arch ? proc_machine_arch(curproc) :
 	    MACHINE_ARCH;
 	return (SYSCTL_OUT(req, arch, strlen(arch) + 1));
 }
 SYSCTL_PROC(_hw, HW_MACHINE_ARCH, machine_arch, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine_arch, "A",
     "System architecture");
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 /*
  * Handler for kern.supported_archs: a space-separated list of the
  * architectures binaries may use.  MACHINE_ARCH32 is appended only when
  * COMPAT_FREEBSD32 is configured and compat_freebsd_32bit is non-zero.
  */
 static int
 sysctl_kern_supported_archs(SYSCTL_HANDLER_ARGS)
 {
 	const char *supported_archs;
 
 	/* Adjacent string literals are joined at compile time. */
 	supported_archs =
 #ifdef COMPAT_FREEBSD32
 	    compat_freebsd_32bit ? MACHINE_ARCH " " MACHINE_ARCH32 :
 #endif
 	    MACHINE_ARCH;
 	return (SYSCTL_OUT(req, supported_archs, strlen(supported_archs) + 1));
 }
 SYSCTL_PROC(_kern, OID_AUTO, supported_archs, CTLFLAG_RD | CTLFLAG_MPSAFE |
     CTLFLAG_CAPRD | CTLTYPE_STRING, NULL, 0, sysctl_kern_supported_archs, "A",
     "Supported architectures for binaries");
 
 /*
  * Shared get/set handler for per-prison string fields.
  *
  * arg1 carries the byte offset of the field inside struct prison and arg2
  * its length (see the three SYSCTL_PROC registrations below).  On a read
  * the calling prison's copy is returned.  On a write the new value is
  * stored in the nearest prison marked PR_HOST, walking up from the caller,
  * and in each of its descendants up to (but not including) the next
  * PR_HOST prison.
  *
  * Returns EPERM if the calling prison lacks PR_ALLOW_SET_HOSTNAME, else
  * the result of sysctl_handle_string().
  */
 static int
 sysctl_hostname(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr, *cpr;
 	size_t pr_offset;
 	char tmpname[MAXHOSTNAMELEN];
 	int descend, error, len;
 
 	/*
 	 * This function can set: hostname domainname hostuuid.
 	 * Keep that in mind when comments say "hostname".
 	 */
 	pr_offset = (size_t)arg1;
 	len = arg2;
 	KASSERT(len <= sizeof(tmpname),
 	    ("length %d too long for %s", len, __func__));
 
 	/*
 	 * Make a local copy of hostname to get/set so we don't have to hold
 	 * the jail mutex during the sysctl copyin/copyout activities.
 	 */
 	pr = req->td->td_ucred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	bcopy((char *)pr + pr_offset, tmpname, len);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Nothing more to do on error or for a read-only request. */
 	error = sysctl_handle_string(oidp, tmpname, len, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/*
 	 * Copy the locally set hostname to all jails that share
 	 * this host info.
 	 */
 	sx_slock(&allprison_lock);
 	if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME))
 		error = EPERM;
 	else {
 		/* Find the prison that owns this host info. */
 		while (!(pr->pr_flags & PR_HOST))
 			pr = pr->pr_parent;
 		mtx_lock(&pr->pr_mtx);
 		bcopy(tmpname, (char *)pr + pr_offset, len);
 		/*
 		 * Clearing 'descend' at a PR_HOST child keeps the traversal
 		 * out of that child's subtree.
 		 */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
 			if (cpr->pr_flags & PR_HOST)
 				descend = 0;
 			else
 				bcopy(tmpname, (char *)cpr + pr_offset, len);
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
     (void *)(offsetof(struct prison, pr_hostname)), MAXHOSTNAMELEN,
     sysctl_hostname, "A", "Hostname");
 SYSCTL_PROC(_kern, KERN_NISDOMAINNAME, domainname,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
     (void *)(offsetof(struct prison, pr_domainname)), MAXHOSTNAMELEN,
     sysctl_hostname, "A", "Name of the current YP/NIS domain");
 SYSCTL_PROC(_kern, KERN_HOSTUUID, hostuuid,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_CAPRD | CTLFLAG_MPSAFE,
     (void *)(offsetof(struct prison, pr_hostuuid)), HOSTUUIDLEN,
     sysctl_hostname, "A", "Host UUID");
 
 static int	regression_securelevel_nonmonotonic = 0;
 
 #ifdef REGRESSION
 SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW,
     &regression_securelevel_nonmonotonic, 0, "securelevel may be lowered");
 #endif
 
 static int
 sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr, *cpr;
 	int descend, error, level;
 
 	pr = req->td->td_ucred->cr_prison;
 
 	/*
 	 * Reading the securelevel is easy, since the current jail's level
 	 * is known to be at least as secure as any higher levels.  Perform
 	 * a lockless read since the securelevel is an integer.
 	 */
 	level = pr->pr_securelevel;
 	error = sysctl_handle_int(oidp, &level, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	/* Permit update only if the new securelevel exceeds the old. */
 	sx_slock(&allprison_lock);
 	mtx_lock(&pr->pr_mtx);
 	if (!regression_securelevel_nonmonotonic &&
 	    level < pr->pr_securelevel) {
 		mtx_unlock(&pr->pr_mtx);
 		sx_sunlock(&allprison_lock);
 		return (EPERM);
 	}
 	pr->pr_securelevel = level;
 	/*
 	 * Set all child jails to be at least this level, but do not lower
 	 * them (even if regression_securelevel_nonmonotonic).
 	 */
 	FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) {
 		if (cpr->pr_securelevel < level)
 			cpr->pr_securelevel = level;
 	}
 	mtx_unlock(&pr->pr_mtx);
 	sx_sunlock(&allprison_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_securelvl, "I",
     "Current secure level");
 
 #ifdef INCLUDE_CONFIG_FILE
 /*
  * Actual kernel configuration options, exported read-only as kern.conftxt.
  * Only present when the kernel is built with INCLUDE_CONFIG_FILE.
  */
 extern const char kernconfstring[];
 
 SYSCTL_CONST_STRING(_kern, OID_AUTO, conftxt, CTLFLAG_RD,
     kernconfstring, "Kernel configuration file");
 #endif
 
 /*
  * Get/set handler for kern.hostid.
  *
  * A read returns the calling prison's pr_hostid.  A write stores the new
  * value in the nearest prison marked PR_HOST, walking up from the caller,
  * and in each of its descendants up to (but not including) the next
  * PR_HOST prison.
  *
  * Returns EPERM if the calling prison lacks PR_ALLOW_SET_HOSTNAME, else
  * the result of sysctl_handle_long().
  */
 static int
 sysctl_hostid(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr, *cpr;
 	u_long tmpid;
 	int descend, error;
 
 	/*
 	 * Like sysctl_hostname, except it operates on a u_long
 	 * instead of a string, and is used only for hostid.
 	 */
 	pr = req->td->td_ucred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	tmpid = pr->pr_hostid;
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Nothing more to do on error or for a read-only request. */
 	error = sysctl_handle_long(oidp, &tmpid, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	sx_slock(&allprison_lock);
 	if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME))
 		error = EPERM;
 	else {
 		/* Find the prison that owns this host info. */
 		while (!(pr->pr_flags & PR_HOST))
 			pr = pr->pr_parent;
 		mtx_lock(&pr->pr_mtx);
 		pr->pr_hostid = tmpid;
 		/*
 		 * Clearing 'descend' at a PR_HOST child keeps the traversal
 		 * out of that child's subtree.
 		 */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
 			if (cpr->pr_flags & PR_HOST)
 				descend = 0;
 			else
 				cpr->pr_hostid = tmpid;
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_HOSTID, hostid,
     CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
     NULL, 0, sysctl_hostid, "LU", "Host ID");
 
 static struct mtx bootid_lk;
 MTX_SYSINIT(bootid_lock, &bootid_lk, "bootid generator lock", MTX_DEF);
 
 /*
  * Handler for kern.boot_id: a 16-byte random identifier.
  *
  * The identifier is generated on the first successful request, under
  * bootid_lk, and the same bytes are returned from then on.  Returns ENXIO
  * if it has not been generated yet and the random subsystem is not
  * seeded.
  */
 static int
 sysctl_bootid(SYSCTL_HANDLER_ARGS)
 {
 	static uint8_t boot_id[16];
 	static bool initialized = false;
 
 	mtx_lock(&bootid_lk);
 	if (!initialized && !is_random_seeded()) {
 		/* Too early: refuse rather than hand out weak bytes. */
 		mtx_unlock(&bootid_lk);
 		return (ENXIO);
 	}
 	if (!initialized) {
 		arc4random_buf(boot_id, sizeof(boot_id));
 		initialized = true;
 	}
 	mtx_unlock(&bootid_lk);
 
 	return (SYSCTL_OUT(req, boot_id, sizeof(boot_id)));
 }
 SYSCTL_PROC(_kern, OID_AUTO, boot_id,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_CAPRD,
     NULL, 0, sysctl_bootid, "", "Random boot ID");
 
 /*
  * Handler for kern.osrelease.
  *
  * The osrelease string is copied from the global (osrelease in vers.c)
  * into prison0 by a sysinit and is inherited by child jails if not changed
  * at jail creation, so the value reported is always the calling prison's
  * own copy, including its terminating NUL.
  */
 static int
 sysctl_osrelease(SYSCTL_HANDLER_ARGS)
 {
 	const char *release;
 
 	release = req->td->td_ucred->cr_prison->pr_osrelease;
 	return (SYSCTL_OUT(req, release, strlen(release) + 1));
 }
 
 SYSCTL_PROC(_kern, KERN_OSRELEASE, osrelease,
     CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_osrelease, "A", "Operating system release");
 
 /*
  * Handler for kern.osreldate.
  *
  * The osreldate number is copied from the global (osreldate in vers.c)
  * into prison0 by a sysinit and is inherited by child jails if not changed
  * at jail creation, so the value reported is always the calling prison's
  * own copy.
  */
 static int
 sysctl_osreldate(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *jail;
 
 	jail = req->td->td_ucred->cr_prison;
 	return (SYSCTL_OUT(req, &jail->pr_osreldate,
 	    sizeof(jail->pr_osreldate)));
 }
 
 /*
  * NOTICE: The *userland* release date is available in
  * /usr/include/osreldate.h
  */
 SYSCTL_PROC(_kern, KERN_OSRELDATE, osreldate,
     CTLTYPE_INT | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_osreldate, "I", "Kernel release date");
 
 /*
  * The build-id is copied from the ELF section .note.gnu.build-id.  The linker
  * script defines two variables to expose the beginning and end.  LLVM
  * currently uses a SHA-1 hash, but other formats can be supported by checking
  * the length of the section.
  */
 
 extern char __build_id_start[];
 extern char __build_id_end[];
 
 #define	BUILD_ID_HEADER_LEN	0x10
 #define	BUILD_ID_HASH_MAXLEN	0x14
 
 /*
  * Export the kernel build-id as a NUL-terminated lowercase hex string.
  *
  * The note section delimited by __build_id_start/__build_id_end begins with
  * a BUILD_ID_HEADER_LEN byte header (name length, value length, type and
  * the "GNU\0" name); everything after it is treated as the hash.  ENOENT is
  * returned when the section holds no hash bytes or more than
  * BUILD_ID_HASH_MAXLEN of them.
  */
 static int
 sysctl_build_id(SYSCTL_HANDLER_ARGS)
 {
 	char hex[2*BUILD_ID_HASH_MAXLEN+1];
 	const uintptr_t notelen =
 	    (uintptr_t)(__build_id_end - __build_id_start);
 	const char *hash;
 	int nbytes, idx;
 
 	/* Section too short to carry anything beyond the note header. */
 	if (notelen <= BUILD_ID_HEADER_LEN)
 		return (ENOENT);
 	/* Hash longer than the conversion buffer was sized for. */
 	if (notelen > (BUILD_ID_HEADER_LEN + BUILD_ID_HASH_MAXLEN))
 		return (ENOENT);
 
 	hash = &__build_id_start[BUILD_ID_HEADER_LEN];
 	nbytes = notelen - BUILD_ID_HEADER_LEN;
 
 	/*
 	 * Two hex digits per byte; each snprintf() also writes a NUL that
 	 * the next iteration overwrites, so the last one ends the string.
 	 */
 	for (idx = 0; idx < nbytes; idx++) {
 		uint8_t byte = hash[idx];
 
 		snprintf(&hex[2*idx], 3, "%02x", byte);
 	}
 
 	return (SYSCTL_OUT(req, hex, strlen(hex) + 1));
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, build_id,
     CTLTYPE_STRING | CTLFLAG_CAPRD | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_build_id, "A", "Operating system build-id");
 
 SYSCTL_NODE(_kern, OID_AUTO, features, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Kernel Features");
 
 #ifdef COMPAT_FREEBSD4
 FEATURE(compat_freebsd4, "Compatible with FreeBSD 4");
 #endif
 
 #ifdef COMPAT_FREEBSD5
 FEATURE(compat_freebsd5, "Compatible with FreeBSD 5");
 #endif
 
 #ifdef COMPAT_FREEBSD6
 FEATURE(compat_freebsd6, "Compatible with FreeBSD 6");
 #endif
 
 #ifdef COMPAT_FREEBSD7
 FEATURE(compat_freebsd7, "Compatible with FreeBSD 7");
 #endif
 
 #ifdef COMPAT_FREEBSD8
 FEATURE(compat_freebsd8, "Compatible with FreeBSD 8");
 #endif
 
 #ifdef COMPAT_FREEBSD9
 FEATURE(compat_freebsd9, "Compatible with FreeBSD 9");
 #endif
 
 #ifdef COMPAT_FREEBSD10
 FEATURE(compat_freebsd10, "Compatible with FreeBSD 10");
 #endif
 
 #ifdef COMPAT_FREEBSD11
 FEATURE(compat_freebsd11, "Compatible with FreeBSD 11");
 #endif
 
 #ifdef COMPAT_FREEBSD12
 FEATURE(compat_freebsd12, "Compatible with FreeBSD 12");
 #endif
 
 #ifdef COMPAT_FREEBSD13
 FEATURE(compat_freebsd13, "Compatible with FreeBSD 13");
 #endif
 
 /*
  * This is really cheating.  These actually live in the libc, something
  * which I'm not quite sure is a good idea anyway, but in order for
  * getnext and friends to actually work, we define dummies here.
  *
  * XXXRW: These probably should be CTLFLAG_CAPRD.
  */
 SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD,
     "", 0, "PATH that finds all the standard utilities");
 SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Max ibase/obase values in bc(1)");
 SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Max array size in bc(1)");
 SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Max scale value in bc(1)");
 SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Max string length in bc(1)");
 SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry");
 SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "");
 SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Max length (bytes) of a text-processing utility's input line");
 SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Maximum number of repeats of a regexp permitted");
 SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0,
     "The version of POSIX 1003.2 with which the system attempts to comply");
 SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether C development supports the C bindings option");
 SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports the C development utilities option");
 SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "");
 SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN development utilities");
 SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports FORTRAN runtime utilities");
 SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports creation of locales");
 SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports software development utilities");
 SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Whether system supports the user portability utilities");
 SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of streams a process may have open at one time");
 SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 0, "Min Maximum number of types supported for timezone names");
 
 static char localbase[MAXPATHLEN] = "";
 
 SYSCTL_STRING(_user, USER_LOCALBASE, localbase, CTLFLAG_RWTUN,
     localbase, sizeof(localbase), "Prefix used to install and locate add-on packages");
 
 #include <sys/vnode.h>
-SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct vnode), "sizeof(struct vnode)");
+SYSCTL_SIZEOF_STRUCT(vnode);
 
-SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct proc), "sizeof(struct proc)");
+SYSCTL_SIZEOF_STRUCT(proc);
 
 static int
 sysctl_kern_pid_max(SYSCTL_HANDLER_ARGS)
 {
 	int error, pm;
 
 	pm = pid_max;
 	error = sysctl_handle_int(oidp, &pm, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	sx_xlock(&proctree_lock);
 	sx_xlock(&allproc_lock);
 
 	/*
 	 * Only permit the values less then PID_MAX.
 	 * As a safety measure, do not allow to limit the pid_max too much.
 	 */
 	if (pm < 300 || pm > PID_MAX)
 		error = EINVAL;
 	else
 		pid_max = pm;
 	sx_xunlock(&allproc_lock);
 	sx_xunlock(&proctree_lock);
 	return (error);
 }
 SYSCTL_PROC(_kern, OID_AUTO, pid_max, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_pid_max, "I", "Maximum allowed pid");
 
 #include <sys/bio.h>
 #include <sys/buf.h>
-SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct bio), "sizeof(struct bio)");
-SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct buf), "sizeof(struct buf)");
+SYSCTL_SIZEOF_STRUCT(bio);
+SYSCTL_SIZEOF_STRUCT(buf);
 
 #include <sys/user.h>
-SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)");
+SYSCTL_SIZEOF_STRUCT(kinfo_proc);
 
 /* Used by kernel debuggers. */
 const int pcb_size = sizeof(struct pcb);
-SYSCTL_INT(_debug_sizeof, OID_AUTO, pcb, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct pcb), "sizeof(struct pcb)");
+SYSCTL_SIZEOF_STRUCT(pcb);
 
 /* XXX compatibility, remove for 6.0 */
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW,
     &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)),
     "compatibility for kern.fallback_elf_brand");
diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c
index 6c0977d1cc35..3a107ac30390 100644
--- a/sys/kern/subr_devstat.c
+++ b/sys/kern/subr_devstat.c
@@ -1,604 +1,603 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include <sys/param.h>
 #include <sys/disk.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/devicestat.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/conf.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/atomic.h>
 
 SDT_PROVIDER_DEFINE(io);
 
 SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *");
 SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *");
 
 #define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
 #define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)
 
 static int devstat_num_devs;
 static long devstat_generation = 1;
 static int devstat_version = DEVSTAT_VERSION;
 static int devstat_current_devnumber;
 static struct mtx devstat_mutex;
 MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);
 
 static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
 static struct devstat *devstat_alloc(void);
 static void devstat_free(struct devstat *);
 static void devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		       int unit_number, uint32_t block_size,
 		       devstat_support_flags flags,
 		       devstat_type_flags device_type,
 		       devstat_priority priority);
 
 /*
  * Allocate a devstat and initialize it
  */
 struct devstat *
 devstat_new_entry(const void *dev_name,
 		  int unit_number, uint32_t block_size,
 		  devstat_support_flags flags,
 		  devstat_type_flags device_type,
 		  devstat_priority priority)
 {
 	struct devstat *entry;
 
 	/* devstat_alloc() takes devstat_mutex itself, so enter unlocked. */
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	entry = devstat_alloc();
 
 	mtx_lock(&devstat_mutex);
 	if (unit_number != -1) {
 		/* Regular device: fill it in and queue it on the list. */
 		devstat_add_entry(entry, dev_name, unit_number, block_size,
 				  flags, device_type, priority);
 	} else {
 		/*
 		 * A unit number of -1 marks an entry that is kept off the
 		 * device list; only its identity and creation time are set.
 		 */
 		entry->unit_number = unit_number;
 		entry->id = dev_name;
 		binuptime(&entry->creation_time);
 		devstat_generation++;
 	}
 	mtx_unlock(&devstat_mutex);
 
 	return (entry);
 }
 
 /*
  * Take a malloced and zeroed devstat structure given to us, fill it in 
  * and add it to the queue of devices.  
  */
 static void
 devstat_add_entry(struct devstat *ds, const void *dev_name, 
 		  int unit_number, uint32_t block_size,
 		  devstat_support_flags flags,
 		  devstat_type_flags device_type,
 		  devstat_priority priority)
 {
 	struct devstatlist *devstat_head;
 	struct devstat *ds_tmp;
 
 	/* The caller must already hold devstat_mutex. */
 	mtx_assert(&devstat_mutex, MA_OWNED);
 	/* Counted before insertion, so a value of 1 means "first device". */
 	devstat_num_devs++;
 
 	devstat_head = &device_statq;
 
 	/*
 	 * Priority sort.  Each driver passes in its priority when it adds
 	 * its devstat entry.  Drivers are sorted first by priority, and
 	 * then by probe order.
 	 *
 	 * For the first device, we just insert it, since the priority
 	 * doesn't really matter yet.  Subsequent devices are inserted into
 	 * the list using the order outlined above.
 	 */
 	if (devstat_num_devs == 1)
 		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
 	else {
 		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
 			struct devstat *ds_next;
 
 			ds_next = STAILQ_NEXT(ds_tmp, dev_links);
 
 			/*
 			 * If we find a break between higher and lower
 			 * priority items, and if this item fits in the
 			 * break, insert it.  This also applies if the
 			 * "lower priority item" is the end of the list.
 			 */
 			if ((priority <= ds_tmp->priority)
 			 && ((ds_next == NULL)
 			   || (priority > ds_next->priority))) {
 				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
 						    dev_links);
 				break;
 			} else if (priority > ds_tmp->priority) {
 				/*
 				 * If this is the case, we should be able
 				 * to insert ourselves at the head of the
 				 * list.  If we can't, something is wrong.
 				 */
 				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
 					STAILQ_INSERT_HEAD(devstat_head,
 							   ds, dev_links);
 					break;
 				} else {
 					/*
 					 * The list is not in priority order;
 					 * append the entry anyway and log it.
 					 */
 					STAILQ_INSERT_TAIL(devstat_head,
 							   ds, dev_links);
 					printf("devstat_add_entry: HELP! "
 					       "sorting problem detected "
 					       "for name %p unit %d\n",
 					       dev_name, unit_number);
 					break;
 				}
 			}
 		}
 	}
 
 	/*
 	 * Fill in the entry itself.  The name is copied with truncation to
 	 * DEVSTAT_NAME_LEN, and the generation count is bumped last to
 	 * record that the list has changed.
 	 */
 	ds->device_number = devstat_current_devnumber++;
 	ds->unit_number = unit_number;
 	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
 	ds->block_size = block_size;
 	ds->flags = flags;
 	ds->device_type = device_type;
 	ds->priority = priority;
 	binuptime(&ds->creation_time);
 	devstat_generation++;
 }
 
 /*
  * Remove a devstat structure from the list of devices.
  */
 void
 devstat_remove_entry(struct devstat *ds)
 {
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/* Removing a NULL entry is a no-op. */
 	if (ds == NULL)
 		return;
 
 	mtx_lock(&devstat_mutex);
 
 	/* Bump sequence1 before the structure is torn down. */
 	atomic_add_acq_int(&ds->sequence1, 1);
 
 	/* Entries with unit number -1 were never placed on the list. */
 	if (ds->unit_number != -1) {
 		devstat_num_devs--;
 		STAILQ_REMOVE(&device_statq, ds, devstat, dev_links);
 	}
 
 	devstat_free(ds);
 	devstat_generation++;
 
 	mtx_unlock(&devstat_mutex);
 }
 
 /*
  * Record a transaction start.
  *
  * See comments for devstat_end_transaction().  Ordering is very important
  * here.
  */
 void
 devstat_start_transaction(struct devstat *ds, const struct bintime *now)
 {
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	/*
 	 * sequence1 is bumped before any field is touched and sequence0
 	 * after the last one, bracketing the update.
 	 */
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/*
 	 * We only want to set the start time when we are going from idle
 	 * to busy.  The start time is really the start of the latest busy
 	 * period.
 	 *
 	 * The fetch-and-add result is compared against end_count: when the
 	 * two match, no transaction was outstanding before this one.
 	 * If "now" is NULL the current uptime is sampled instead.
 	 */
 	if (atomic_fetchadd_int(&ds->start_count, 1) == ds->end_count) {
 		if (now != NULL)
 			ds->busy_from = *now;
 		else
 			binuptime(&ds->busy_from);
 	}
 	atomic_add_rel_int(&ds->sequence0, 1);
 }
 
 /*
  * Stamp the bio with the current uptime and record the start of a
  * transaction for it.  Does nothing when ds is NULL.
  */
 void
 devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
 {
 
 	if (ds != NULL) {
 		/* bio_t0 is the start time later used at completion. */
 		binuptime(&bp->bio_t0);
 		devstat_start_transaction_bio_t0(ds, bp);
 	}
 }
 
 /*
  * Record the start of a transaction for a bio whose bio_t0 has already
  * been filled in, then fire the io:::start probe.  Does nothing when ds
  * is NULL.
  */
 void
 devstat_start_transaction_bio_t0(struct devstat *ds, struct bio *bp)
 {
 
 	if (ds != NULL) {
 		devstat_start_transaction(ds, &bp->bio_t0);
 		DTRACE_DEVSTAT_BIO_START();
 	}
 }
 
 /*
  * Record the ending of a transaction, and increment the various counters.
  *
  * Ordering in this function, and in devstat_start_transaction() is VERY
  * important.  The idea here is to run without locks, so we are very
  * careful to only modify some fields on the way "down" (i.e. at
  * transaction start) and some fields on the way "up" (i.e. at transaction
  * completion).  One exception is busy_from, which we only modify in
  * devstat_start_transaction() when there are no outstanding transactions,
  * and thus it can't be modified in devstat_end_transaction()
  * simultaneously.
  *
  * The sequence0 and sequence1 fields are provided to enable an application
  * spying on the structures with mmap(2) to tell when a structure is in a
  * consistent state or not.
  *
  * For this to work 100% reliably, it is important that the two fields
  * are at opposite ends of the structure and that they are incremented
  * in the opposite order of how a memcpy(3) in userland would copy them.
  * We assume that the copying happens front to back, but there is actually
  * no way short of writing your own memcpy(3) replacement to guarantee
  * this will be the case.
  *
  * In addition to this, being a kind of locks, they must be updated with
  * atomic instructions using appropriate memory barriers.
  */
 void
 devstat_end_transaction(struct devstat *ds, uint32_t bytes, 
 			devstat_tag_type tag_type, devstat_trans_flags flags,
 			const struct bintime *now, const struct bintime *then)
 {
 	struct bintime dt, lnow;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	/* Without a caller-supplied completion time, sample it here. */
 	if (now == NULL) {
 		binuptime(&lnow);
 		now = &lnow;
 	}
 
 	/* sequence1 first, sequence0 last: brackets the field updates. */
 	atomic_add_acq_int(&ds->sequence1, 1);
 	/* Update byte and operations counts */
 	ds->bytes[flags] += bytes;
 	ds->operations[flags]++;
 
 	/*
 	 * Keep a count of the various tag types sent.
 	 */
 	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
 	    tag_type != DEVSTAT_TAG_NONE)
 		ds->tag_types[tag_type]++;
 
 	/* "then" is the transaction's start time; NULL skips duration. */
 	if (then != NULL) {
 		/* Update duration of operations */
 		dt = *now;
 		bintime_sub(&dt, then);
 		bintime_add(&ds->duration[flags], &dt);
 	}
 
 	/*
 	 * Accumulate busy time: add the interval since busy_from, then
 	 * restart the busy period at "now".
 	 */
 	dt = *now;
 	bintime_sub(&dt, &ds->busy_from);
 	bintime_add(&ds->busy_time, &dt);
 	ds->busy_from = *now;
 
 	ds->end_count++;
 	atomic_add_rel_int(&ds->sequence0, 1);
 }
 
 /*
  * Record completion of a bio.  Passes a NULL completion time to
  * devstat_end_transaction_bio_bt().
  */
 void
 devstat_end_transaction_bio(struct devstat *ds, const struct bio *bp)
 {
 
 	devstat_end_transaction_bio_bt(ds, bp, NULL);
 }
 
 /*
  * Record completion of a bio at time "now" (NULL is passed through
  * unchanged).  The bio's flags select the tag type and its command selects
  * the statistics bucket; the byte count is bio_bcount less bio_resid.
  * Does nothing when ds is NULL.
  */
 void
 devstat_end_transaction_bio_bt(struct devstat *ds, const struct bio *bp,
     const struct bintime *now)
 {
 	devstat_trans_flags flg;
 	devstat_tag_type tag;
 
 	/* sanity check */
 	if (ds == NULL)
 		return;
 
 	tag = (bp->bio_flags & BIO_ORDERED) ?
 	    DEVSTAT_TAG_ORDERED : DEVSTAT_TAG_SIMPLE;
 
 	switch (bp->bio_cmd) {
 	case BIO_DELETE:
 		flg = DEVSTAT_FREE;
 		break;
 	case BIO_READ:
 		flg = DEVSTAT_READ;
 		break;
 	case BIO_ZONE:
 		/* Only a zone report is accounted as a read. */
 		if (bp->bio_zone.zone_cmd == DISK_ZONE_REPORT_ZONES)
 			flg = DEVSTAT_READ;
 		else
 			flg = DEVSTAT_NO_DATA;
 		break;
 	case BIO_WRITE:
 		flg = DEVSTAT_WRITE;
 		break;
 	default:
 		flg = DEVSTAT_NO_DATA;
 		break;
 	}
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
 				tag, flg, now, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_DONE();
 }
 
 /*
  * This is the sysctl handler for the devstat package.  The data pushed out
  * on the kern.devstat.all sysctl variable consists of the current devstat
  * generation number, and then an array of devstat structures, one for each
  * device in the system.
  *
  * This is more cryptic than obvious, but basically we neither can nor
  * want to hold the devstat_mutex for any amount of time, so we grab it
  * only when we need to and keep an eye on devstat_generation all the time.
  */
 /*
  * Handler for kern.devstat.all.
  *
  * Copies out the generation number sampled at entry, followed by one
  * struct devstat per entry on the device list.  devstat_mutex is taken
  * only to pick up the list head and to step to the next entry, so the
  * generation is re-checked each time.
  *
  * Returns 0 on success, EBUSY if the list changed while it was being
  * copied out, or the error reported by SYSCTL_OUT().
  */
 static int
 sysctl_devstat(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long mygen;
 	struct devstat *nds;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 
 	/*
 	 * XXX devstat_generation should really be "volatile" but that
 	 * XXX freaks out the sysctl macro below.  The places where we
 	 * XXX change it and inspect it are bracketed in the mutex which
 	 * XXX guarantees us proper write barriers.  I don't believe the
 	 * XXX compiler is allowed to optimize mygen away across calls
 	 * XXX to other functions, so the following is believed to be safe.
 	 */
 	mygen = devstat_generation;
 
 	/*
 	 * A failed copy-out of the generation number is an error whether
 	 * or not any devices are registered, so check it before taking
 	 * the empty-list shortcut.
 	 */
 	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));
 	if (error != 0)
 		return (error);
 
 	if (devstat_num_devs == 0)
 		return (0);
 
 	/* Pick up the list head, unless the list has already changed. */
 	mtx_lock(&devstat_mutex);
 	nds = STAILQ_FIRST(&device_statq);
 	if (mygen != devstat_generation)
 		error = EBUSY;
 	mtx_unlock(&devstat_mutex);
 
 	if (error != 0)
 		return (error);
 
 	while (nds != NULL) {
 		/* The copy-out is done with the mutex released. */
 		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
 		if (error != 0)
 			return (error);
 
 		/* Only follow the link if the list is still unchanged. */
 		mtx_lock(&devstat_mutex);
 		if (mygen != devstat_generation)
 			error = EBUSY;
 		else
 			nds = STAILQ_NEXT(nds, dev_links);
 		mtx_unlock(&devstat_mutex);
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Sysctl entries for devstat.  The first one is a node that all the rest
  * hang off of. 
  */
 /* kern.devstat: parent node for the entries below. */
 static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "Device Statistics");
 
 /* kern.devstat.all: read-only opaque dump produced by sysctl_devstat(). */
 SYSCTL_PROC(_kern_devstat, OID_AUTO, all,
     CTLFLAG_RD | CTLTYPE_OPAQUE | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_devstat, "S,devstat",
     "All devices in the devstat list");
 /*
  * Export the number of devices in the system so that userland utilities
  * can determine how much memory to allocate to hold all the devices.
  */
 SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, 
     &devstat_num_devs, 0, "Number of devices in the devstat list");
 /* kern.devstat.generation: incremented whenever an entry is added or removed. */
 SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
     &devstat_generation, 0, "Devstat list generation");
 /* kern.devstat.version: the DEVSTAT_VERSION this kernel was built with. */
 SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, 
     &devstat_version, 0, "Devstat list version number");
 
 /*
  * Allocator for struct devstat structures.  We sub-allocate these from pages
  * which we get from malloc.  These pages are exported for mmap(2)'ing through
  * a miniature device driver
  */
 
 #define statsperpage (PAGE_SIZE / sizeof(struct devstat))
 
 static d_ioctl_t devstat_ioctl;
 static d_mmap_t devstat_mmap;
 
 static struct cdevsw devstat_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	devstat_ioctl,
 	.d_mmap =	devstat_mmap,
 	.d_name =	"devstat",
 };
 
 struct statspage {
 	TAILQ_ENTRY(statspage)	list;
 	struct devstat		*stat;
 	u_int			nfree;
 };
 
 static size_t pagelist_pages = 0;
 static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
 static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
 
 /*
  * ioctl handler for the devstat device.  Only DIOCGMEDIASIZE is
  * supported; it reports the total size in bytes of the statistics pages.
  * Every other command fails with ENOTTY.
  */
 static int
 devstat_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
     struct thread *td)
 {
 
 	if (cmd != DIOCGMEDIASIZE)
 		return (ENOTTY);
 
 	*(off_t *)data = pagelist_pages * PAGE_SIZE;
 	return (0);
 }
 
 /*
  * mmap handler for the devstat device.  Translates a byte offset into the
  * physical address of the matching statistics page, counting pages in
  * list order.  Only read-only mappings are allowed.  Returns 0 on success
  * and -1 when the protection is not exactly VM_PROT_READ or the offset
  * matches no page.
  */
 static int
 devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
     int nprot, vm_memattr_t *memattr)
 {
 	struct statspage *page;
 	int rv;
 
 	if (nprot != VM_PROT_READ)
 		return (-1);
 
 	rv = -1;
 	mtx_lock(&devstat_mutex);
 	TAILQ_FOREACH(page, &pagelist, list) {
 		/* Consume one page worth of offset per list element. */
 		if (offset == 0) {
 			*paddr = vtophys(page->stat);
 			rv = 0;
 			break;
 		}
 		offset -= PAGE_SIZE;
 	}
 	mtx_unlock(&devstat_mutex);
 
 	return (rv);
 }
 
 /*
  * Hand out one zeroed struct devstat slot, marking it allocated.  Slots
  * are carved from PAGE_SIZE buffers tracked on "pagelist"; a new page is
  * added when no existing page has a free slot.  Must be entered without
  * devstat_mutex held.  The first call also creates the devstat device
  * node.
  */
 static struct devstat *
 devstat_alloc(void)
 {
 	struct devstat *dsp;
 	struct statspage *spp, *spp2;
 	u_int u;
 	static int once;
 
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	/* NOTE(review): "once" is tested and set without the mutex held. */
 	if (!once) {
 		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
 		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444,
 		    DEVSTAT_DEVICE_NAME);
 		once = 1;
 	}
 	spp2 = NULL;
 	mtx_lock(&devstat_mutex);
 	for (;;) {
 		/* Look for an existing page with at least one free slot. */
 		TAILQ_FOREACH(spp, &pagelist, list) {
 			if (spp->nfree > 0)
 				break;
 		}
 		if (spp != NULL)
 			break;
 		/* None found: drop the mutex around the M_WAITOK mallocs. */
 		mtx_unlock(&devstat_mutex);
 		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
 		spp2->nfree = statsperpage;
 
 		/*
 		 * If free statspages were added while the lock was released
 		 * just reuse them.
 		 */
 		mtx_lock(&devstat_mutex);
 		TAILQ_FOREACH(spp, &pagelist, list)
 			if (spp->nfree > 0)
 				break;
 		if (spp == NULL) {
 			spp = spp2;
 
 			/*
 			 * It would make more sense to add the new page at the
 			 * head but the order on the list determine the
 			 * sequence of the mapping so we can't do that.
 			 */
 			pagelist_pages++;
 			TAILQ_INSERT_TAIL(&pagelist, spp, list);
 		} else
 			break;
 	}
 	/* Find the first unallocated slot within the chosen page. */
 	dsp = spp->stat;
 	for (u = 0; u < statsperpage; u++) {
 		if (dsp->allocated == 0)
 			break;
 		dsp++;
 	}
 	spp->nfree--;
 	dsp->allocated = 1;
 	mtx_unlock(&devstat_mutex);
 	/* Free a page that was allocated above but ended up unused. */
 	if (spp2 != NULL && spp2 != spp) {
 		free(spp2->stat, M_DEVSTAT);
 		free(spp2, M_DEVSTAT);
 	}
 	return (dsp);
 }
 
 /*
  * Return a devstat slot to its page.  The slot is zeroed, which also
  * clears its "allocated" marker, and the free count of the page that
  * contains it is incremented.  The caller must hold devstat_mutex.
  */
 static void
 devstat_free(struct devstat *dsp)
 {
 	struct statspage *page;
 
 	mtx_assert(&devstat_mutex, MA_OWNED);
 	bzero(dsp, sizeof *dsp);
 
 	TAILQ_FOREACH(page, &pagelist, list) {
 		/* Skip pages whose slot array does not contain dsp. */
 		if (dsp < page->stat || dsp >= (page->stat + statsperpage))
 			continue;
 		page->nfree++;
 		break;
 	}
 }
 
-SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
-    SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)");
+SYSCTL_SIZEOF_STRUCT(devstat);
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 6f92130e12b3..300173347401 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -1,6310 +1,6309 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/counter.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/seqc.h>
 #include <sys/sdt.h>
 #include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #include <ck_queue.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef INVARIANTS
 #include <machine/_inttypes.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 
 /*
  * High level overview of name caching in the VFS layer.
  *
  * Originally caching was implemented as part of UFS, later extracted to allow
  * use by other filesystems. A decision was made to make it optional and
  * completely detached from the rest of the kernel, which comes with limitations
  * outlined near the end of this comment block.
  *
  * This fundamental choice needs to be revisited. In the meantime, the current
  * state is described below. Significance of all notable routines is explained
  * in comments placed above their implementation. Scattered throughout the
  * file are TODO comments indicating shortcomings which can be fixed without
  * reworking everything (most of the fixes will likely be reusable). Various
  * details are omitted from this explanation to not clutter the overview, they
  * have to be checked by reading the code and associated commentary.
  *
  * Keep in mind that it's individual path components which are cached, not full
  * paths. That is, for a fully cached path "foo/bar/baz" there are 3 entries,
  * one for each name.
  *
  * I. Data organization
  *
  * Entries are described by "struct namecache" objects and stored in a hash
  * table. See cache_get_hash for more information.
  *
  * "struct vnode" contains pointers to source entries (names which can be found
  * when traversing through said vnode), destination entries (names of that
  * vnode; see "Limitations" for a breakdown on the subject) and a pointer to
  * the parent vnode.
  *
  * The (directory vnode; name) tuple reliably determines the target entry if
  * it exists.
  *
  * Since there are no small locks at this time (all are 32 bytes in size on
  * LP64), the code works around the problem by introducing lock arrays to
  * protect hash buckets and vnode lists.
  *
  * II. Filesystem integration
  *
  * Filesystems participating in name caching do the following:
  * - set vop_lookup routine to vfs_cache_lookup
  * - set vop_cachedlookup to whatever can perform the lookup if the above fails
  * - if they support lockless lookup (see below), vop_fplookup_vexec and
  *   vop_fplookup_symlink are set along with the MNTK_FPLOOKUP flag on the
  *   mount point
  * - call cache_purge or cache_vop_* routines to eliminate stale entries as
  *   applicable
  * - call cache_enter to add entries depending on the MAKEENTRY flag
  *
  * With the above in mind, there are 2 entry points when doing lookups:
  * - ... -> namei -> cache_fplookup -- this is the default
  * - ... -> VOP_LOOKUP -> vfs_cache_lookup -- normally only called by namei
  *   should the above fail
  *
  * Example code flow how an entry is added:
  * ... -> namei -> cache_fplookup -> cache_fplookup_noentry -> VOP_LOOKUP ->
  * vfs_cache_lookup -> VOP_CACHEDLOOKUP -> ufs_lookup_ino -> cache_enter
  *
  * III. Performance considerations
  *
  * For lockless case forward lookup avoids any writes to shared areas apart
  * from the terminal path component. In other words non-modifying lookups of
  * different files don't suffer any scalability problems in the namecache.
  * Looking up the same file is limited by VFS and goes beyond the scope of this
  * file.
  *
  * At least on amd64 the single-threaded bottleneck for long paths is hashing
  * (see cache_get_hash). There are cases where the code issues acquire fence
  * multiple times, they can be combined on architectures which suffer from it.
  *
  * For locked case each encountered vnode has to be referenced and locked in
  * order to be handed out to the caller (normally that's namei). This
  * introduces a significant single-threaded hit and multi-threaded serialization.
  *
  * Reverse lookup (e.g., "getcwd") fully scales provided it is fully cached --
  * avoids any writes to shared areas to any components.
  *
  * Unrelated insertions are partially serialized on updating the global entry
  * counter and possibly serialized on colliding bucket or vnode locks.
  *
  * IV. Observability
  *
  * Note not everything has an explicit dtrace probe, nor should it; thus
  * some of the one-liners below depend on implementation details.
  *
  * Examples:
  *
  * # Check what lookups failed to be handled in a lockless manner. Column 1 is
  * # line number, column 2 is status code (see cache_fpl_status)
  * dtrace -n 'vfs:fplookup:lookup:done { @[arg1, arg2] = count(); }'
  *
  * # Lengths of names added by binary name
  * dtrace -n 'fbt::cache_enter_time:entry { @[execname] = quantize(args[2]->cn_namelen); }'
  *
  * # Same as above but only those which exceed 64 characters
  * dtrace -n 'fbt::cache_enter_time:entry /args[2]->cn_namelen > 64/ { @[execname] = quantize(args[2]->cn_namelen); }'
  *
  * # Who is performing lookups with spurious slashes (e.g., "foo//bar") and what
  * # path is it
  * dtrace -n 'fbt::cache_fplookup_skip_slashes:entry { @[execname, stringof(args[0]->cnp->cn_pnbuf)] = count(); }'
  *
  * V. Limitations and implementation defects
  *
  * - since it is possible there is no entry for an open file, tools like
  *   "procstat" may fail to resolve fd -> vnode -> path to anything
  * - even if a filesystem adds an entry, it may get purged (e.g., due to memory
  *   shortage) in which case the above problem applies
  * - hardlinks are not tracked, thus if a vnode is reachable in more than one
  *   way, resolving a name may return a different path than the one used to
  *   open it (even if said path is still valid)
  * - by default entries are not added for newly created files
  * - adding an entry may need to evict negative entry first, which happens in 2
  *   distinct places (evicting on lookup, adding in a later VOP) making it
  *   impossible to simply reuse it
  * - there is a simple scheme to evict negative entries as the cache is approaching
  *   its capacity, but it is very unclear if doing so is a good idea to begin with
  * - vnodes are subject to being recycled even if target inode is left in memory,
  *   which loses the name cache entries when it perhaps should not. in case of tmpfs
  *   names get duplicated -- kept by filesystem itself and namecache separately
  * - struct namecache has a fixed size and comes in 2 variants, often wasting space.
  *   now hard to replace with malloc due to dependence on SMR.
  * - lack of better integration with the kernel also turns nullfs into a layered
  *   filesystem instead of something which can take advantage of caching
  */
 
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache");
 
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, enter, duplicate, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, fullpath_smr, hit, "struct vnode *",
     "const char *");
 SDT_PROBE_DEFINE4(vfs, namecache, fullpath_smr, miss, "struct vnode *",
     "struct namecache *", "int", "int");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
     "char *", "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, hit, "struct vnode *",
     "struct componentname *");
 SDT_PROBE_DEFINE2(vfs, namecache, removecnp, miss, "struct vnode *",
     "struct componentname *");
 SDT_PROBE_DEFINE3(vfs, namecache, purge, done, "struct vnode *", "size_t", "size_t");
 SDT_PROBE_DEFINE1(vfs, namecache, purge, batch, "int");
 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, symlink, alloc__fail, "size_t");
 
 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
 SDT_PROBE_DECLARE(vfs, namei, lookup, entry);
 SDT_PROBE_DECLARE(vfs, namei, lookup, return);
 
 static char __read_frequently cache_fast_lookup_enabled = true;
 
 /*
  * State kept for a negative entry.  It shares storage with the target vnode
  * pointer of struct namecache (the n_un union), which is why the assertion
  * below requires it to be no larger than a pointer.
  */
 struct negstate {
 	u_char neg_flag;	/* NEG_* flags */
 	u_char neg_hit;		/* presumably a hit counter -- TODO confirm */
 };
 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
     "the state must fit in a union with a pointer without growing it");
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  *
  * The name is stored inline in the trailing flexible array, so the
  * allocation size depends on the length of the name.
  */
 struct	namecache {
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	union {
 		struct	vnode *nu_vp;	/* vnode the name refers to */
 		struct	negstate nu_neg;/* negative entry state */
 	} n_un;
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[];		/* segment name + nul */
 };
 
 /*
  * struct namecache_ts repeats struct namecache layout up to the
  * nc_nlen member.
  * struct namecache_ts is used in place of struct namecache when time(s) need
  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  * both a non-dotdot directory name plus dotdot for the directory's
  * parent.
  *
  * The regular entry is embedded as the last member; code holding a
  * struct namecache pointer with NCF_TS set recovers the enclosing object
  * with __containerof().
  *
  * See below for alignment requirement.
  */
 struct	namecache_ts {
 	struct	timespec nc_time;	/* timespec provided by fs */
 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
 	int	nc_ticks;		/* ticks value when entry was added */
 	int	nc_pad;			/* padding; presumably keeps nc_nc aligned -- TODO confirm */
 	struct namecache nc_nc;		/* the embedded entry proper */
 };
 
 TAILQ_HEAD(cache_freebatch, namecache);
 
 /*
  * At least mips n32 performs 64-bit accesses to timespec as found
  * in namecache_ts and requires them to be aligned. Since others
  * may be in the same spot suffer a little bit and enforce the
  * alignment for everyone. Note this is a nop for 64-bit platforms.
  */
 #define CACHE_ZONE_ALIGNMENT	UMA_ALIGNOF(time_t)
 
 /*
  * TODO: the initial value of CACHE_PATH_CUTOFF was inherited from the
  * 4.4 BSD codebase. Later on struct namecache was tweaked to become
  * smaller and the value was bumped to retain the total size, but it
  * was never re-evaluated for suitability. A simple test counting
  * lengths during package building shows that the value of 45 covers
  * about 86% of all added entries, reaching 99% at 65.
  *
  * Regardless of the above, use of dedicated zones instead of malloc may be
  * inducing additional waste. This may be hard to address as said zones are
  * tied to VFS SMR. Even if retaining them, the current split should be
  * re-evaluated.
  */
 #ifdef __LP64__
 #define	CACHE_PATH_CUTOFF	45
 #define	CACHE_LARGE_PAD		6
 #else
 #define	CACHE_PATH_CUTOFF	41
 #define	CACHE_LARGE_PAD		2
 #endif
 
 #define CACHE_ZONE_SMALL_SIZE		(offsetof(struct namecache, nc_name) + CACHE_PATH_CUTOFF + 1)
 #define CACHE_ZONE_SMALL_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_SMALL_SIZE)
 #define CACHE_ZONE_LARGE_SIZE		(offsetof(struct namecache, nc_name) + NAME_MAX + 1 + CACHE_LARGE_PAD)
 #define CACHE_ZONE_LARGE_TS_SIZE	(offsetof(struct namecache_ts, nc_nc) + CACHE_ZONE_LARGE_SIZE)
 
 _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size");
 
 #define	nc_vp		n_un.nu_vp
 #define	nc_neg		n_un.nu_neg
 
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	0x01
 #define NCF_ISDOTDOT	0x02
 #define	NCF_TS		0x04
 #define	NCF_DTS		0x08
 #define	NCF_DVDROP	0x10
 #define	NCF_NEGATIVE	0x20
 #define	NCF_INVALID	0x40
 #define	NCF_WIP		0x80
 
 /*
  * Flags in negstate.neg_flag
  */
 #define NEG_HOT		0x01
 
 static bool	cache_neg_evict_cond(u_long lnumcache);
 
 /*
  * Mark an entry as invalid.
  *
  * This is called before it starts getting deconstructed.
  *
  * The entry must not already carry NCF_INVALID (asserted).  The flag is
  * published with an atomic store which is followed by a release fence;
  * the lock-eliding readers check for it in cache_ncp_canuse(), which
  * issues an acquire fence before loading nc_flag.
  */
 static void
 cache_ncp_invalidate(struct namecache *ncp)
 {
 
 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
 	    ("%s: entry %p already invalid", __func__, ncp));
 	/* Set the flag first, then fence. */
 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID);
 	atomic_thread_fence_rel();
 }
 
 /*
  * Check whether the entry can be safely used.
  *
  * All places which elide locks are supposed to call this after they are
  * done with reading from an entry.
  *
  * Evaluates to true when neither NCF_INVALID nor NCF_WIP is set in nc_flag.
  * An acquire fence is issued before the flag is loaded.  The argument is
  * evaluated exactly once.
  */
 #define cache_ncp_canuse(ncp)	({					\
 	struct namecache *_ncp = (ncp);					\
 	u_char _nc_flag;						\
 									\
 	atomic_thread_fence_acq();					\
 	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
 	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP)) == 0);	\
 })
 
 /*
  * Like the above but also checks NCF_WHITE.
  *
  * Evaluates to true only when none of NCF_INVALID, NCF_WIP and NCF_WHITE is
  * set in nc_flag.  An acquire fence is issued before the flag is loaded and
  * the argument is evaluated exactly once.
  */
 #define cache_fpl_neg_ncp_canuse(ncp)	({				\
 	struct namecache *_ncp = (ncp);					\
 	u_char _nc_flag;						\
 									\
 	atomic_thread_fence_acq();					\
 	_nc_flag = atomic_load_char(&_ncp->nc_flag);			\
 	__predict_true((_nc_flag & (NCF_INVALID | NCF_WIP | NCF_WHITE)) == 0);	\
 })
 
 VFS_SMR_DECLARE;
 
 static SYSCTL_NODE(_vfs_cache, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache parameters");
 
 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, size, CTLFLAG_RD, &ncsize, 0,
     "Total namecache capacity");
 
 u_int ncsizefactor = 2;
 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, sizefactor, CTLFLAG_RW, &ncsizefactor, 0,
     "Size factor for namecache");
 
 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
 SYSCTL_ULONG(_vfs_cache_param, OID_AUTO, negfactor, CTLFLAG_RW, &ncnegfactor, 0,
     "Ratio of negative namecache entries");
 
 /*
  * Negative entry % of namecache capacity above which automatic eviction is allowed.
  *
  * Check cache_neg_evict_cond for details.
  */
 static u_int ncnegminpct = 3;
 
 static u_int __read_mostly     neg_min; /* the above recomputed against ncsize */
 SYSCTL_UINT(_vfs_cache_param, OID_AUTO, negmin, CTLFLAG_RD, &neg_min, 0,
     "Negative entry count above which automatic eviction is allowed");
 
 /*
  * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 static u_long __read_mostly	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
 
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 static u_int __exclusive_cache_line neg_cycle;
 
 #define ncneghash	3
 #define	numneglists	(ncneghash + 1)
 
 struct neglist {
 	struct mtx		nl_evict_lock;
 	struct mtx		nl_lock __aligned(CACHE_LINE_SIZE);
 	TAILQ_HEAD(, namecache) nl_list;
 	TAILQ_HEAD(, namecache) nl_hotlist;
 	u_long			nl_hotnum;
 } __aligned(CACHE_LINE_SIZE);
 
 static struct neglist neglists[numneglists];
 
 /*
  * Map an entry to one of the numneglists negative lists.  The choice is
  * derived from the entry's address (shifted right by 8 bits and masked with
  * ncneghash).
  */
 static inline struct neglist *
 NCP2NEGLIST(struct namecache *ncp)
 {
 	uintptr_t slot;
 
 	slot = ((uintptr_t)(ncp) >> 8) & ncneghash;
 	return (&neglists[slot]);
 }
 
 /*
  * Return the negative-entry state embedded in ncp.  The entry must have
  * NCF_NEGATIVE set (asserted), as the state shares storage with the target
  * vnode pointer.
  */
 static inline struct negstate *
 NCP2NEGSTATE(struct namecache *ncp)
 {
 	struct negstate *ns;
 
 	MPASS(atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE);
 	ns = &ncp->nc_neg;
 	return (ns);
 }
 
 #define	numbucketlocks (ncbuckethash + 1)
 static u_int __read_mostly  ncbuckethash;
 static struct mtx_padalign __read_mostly  *bucketlocks;
 #define	HASH2BUCKETLOCK(hash) \
 	((struct mtx *)(&bucketlocks[((hash) & ncbuckethash)]))
 
 #define	numvnodelocks (ncvnodehash + 1)
 static u_int __read_mostly  ncvnodehash;
 static struct mtx __read_mostly *vnodelocks;
 /*
  * Map a vnode to its lock in the vnodelocks array.  The slot is derived
  * from the vnode's address (shifted right by 8 bits and masked with
  * ncvnodehash), so distinct vnodes may share a lock.
  */
 static inline struct mtx *
 VP2VNODELOCK(struct vnode *vp)
 {
 	uintptr_t slot;
 
 	slot = ((uintptr_t)(vp) >> 8) & ncvnodehash;
 	return (&vnodelocks[slot]);
 }
 
 /*
  * Copy out the timestamp and ticks value stored with a NCF_TS entry.
  *
  * Nothing is written when tsp is NULL.  Otherwise both *tsp and *ticksp are
  * filled in, meaning ticksp is dereferenced whenever tsp is non-NULL; callers
  * are expected to pass either both pointers or neither.  Asking for output
  * from an entry without NCF_TS trips the assertion.
  */
 static void
 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 {
 	struct namecache_ts *entry_ts;
 
 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 	    (tsp == NULL && ticksp == NULL),
 	    ("No NCF_TS"));
 
 	if (tsp != NULL) {
 		entry_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		*tsp = entry_ts->nc_time;
 		*ticksp = entry_ts->nc_ticks;
 	}
 }
 
 #ifdef DEBUG_CACHE
 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
     "VFS namecache enabled");
 #endif
 
 /* Export size information to userland */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
-    sizeof(struct namecache), "sizeof(struct namecache)");
+SYSCTL_SIZEOF_STRUCT(namecache);
 
 /*
  * The new name cache statistics
  */
 static SYSCTL_NODE(_vfs_cache, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache statistics");
 
 #define STATNODE_ULONG(name, varname, descr)					\
 	SYSCTL_ULONG(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 #define STATNODE_COUNTER(name, varname, descr)					\
 	static COUNTER_U64_DEFINE_EARLY(varname);				\
 	SYSCTL_COUNTER_U64(_vfs_cache_stats, OID_AUTO, name, CTLFLAG_RD, &varname, \
 	    descr);
 STATNODE_ULONG(neg, numneg, "Number of negative cache entries");
 STATNODE_ULONG(count, numcache, "Number of cache entries");
 STATNODE_COUNTER(heldvnodes, numcachehv, "Number of namecache entries with vnodes held");
 STATNODE_COUNTER(drops, numdrops, "Number of dropped entries due to reaching the limit");
 STATNODE_COUNTER(miss, nummiss, "Number of cache misses");
 STATNODE_COUNTER(misszap, nummisszap, "Number of cache misses we do not want to cache");
 STATNODE_COUNTER(poszaps, numposzaps,
     "Number of cache hits (positive) we do not want to cache");
 STATNODE_COUNTER(poshits, numposhits, "Number of cache hits (positive)");
 STATNODE_COUNTER(negzaps, numnegzaps,
     "Number of cache hits (negative) we do not want to cache");
 STATNODE_COUNTER(neghits, numneghits, "Number of cache hits (negative)");
 /* These count for vn_getcwd(), too. */
 STATNODE_COUNTER(fullpathcalls, numfullpathcalls, "Number of fullpath search calls");
 STATNODE_COUNTER(fullpathfail2, numfullpathfail2,
     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 STATNODE_COUNTER(fullpathfail4, numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 STATNODE_COUNTER(fullpathfound, numfullpathfound, "Number of successful fullpath calls");
 STATNODE_COUNTER(symlinktoobig, symlinktoobig, "Number of times symlink did not fit the cache");
 
 /*
  * Debug or developer statistics.
  */
 static SYSCTL_NODE(_vfs_cache, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache debugging");
 #define DEBUGNODE_ULONG(name, varname, descr)					\
 	SYSCTL_ULONG(_vfs_cache_debug, OID_AUTO, name, CTLFLAG_RD, &varname, 0, descr);
 static u_long zap_bucket_relock_success;
 DEBUGNODE_ULONG(zap_bucket_relock_success, zap_bucket_relock_success,
     "Number of successful removals after relocking");
 static u_long zap_bucket_fail;
 DEBUGNODE_ULONG(zap_bucket_fail, zap_bucket_fail, "");
 static u_long zap_bucket_fail2;
 DEBUGNODE_ULONG(zap_bucket_fail2, zap_bucket_fail2, "");
 static u_long cache_lock_vnodes_cel_3_failures;
 DEBUGNODE_ULONG(vnodes_cel_3_failures, cache_lock_vnodes_cel_3_failures,
     "Number of times 3-way vnode locking failed");
 
 static void cache_zap_locked(struct namecache *ncp);
 static int vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
     char **retbuf, size_t *buflen, size_t addend);
 static int vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf,
     char **retbuf, size_t *buflen);
 static int vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf,
     char **retbuf, size_t *len, size_t addend);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 /*
  * Assert that the given vnode lock is owned by the caller.  A NULL lock is
  * accepted and ignored.
  */
 static inline void
 cache_assert_vlp_locked(struct mtx *vlp)
 {
 
 	if (vlp == NULL)
 		return;
 	mtx_assert(vlp, MA_OWNED);
 }
 
 /*
  * Assert that the namecache lock covering vp is owned by the caller.
  */
 static inline void
 cache_assert_vnode_locked(struct vnode *vp)
 {
 
 	cache_assert_vlp_locked(VP2VNODELOCK(vp));
 }
 
 /*
  * Directory vnodes with entries are held for two reasons:
  * 1. make them less of a target for reclamation in vnlru
  * 2. suffer smaller performance penalty in locked lookup as requeueing is avoided
  *
  * It will be feasible to stop doing it altogether if all filesystems start
  * supporting lockless lookup.
  *
  * The caller must own the namecache lock covering vp, and vp must not have
  * any source entries yet (both are asserted).  Each hold taken here is
  * accounted for in the numcachehv counter.
  */
 static void
 cache_hold_vnode(struct vnode *vp)
 {
 
 	cache_assert_vnode_locked(vp);
 	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 	vhold(vp);
 	counter_u64_add(numcachehv, 1);
 }
 
 /*
  * Release a hold taken by cache_hold_vnode() and decrement the numcachehv
  * counter accordingly.
  */
 static void
 cache_drop_vnode(struct vnode *vp)
 {
 
 	/*
 	 * Called after all locks are dropped, meaning we can't assert
 	 * on the state of v_cache_src.
 	 */
 	vdrop(vp);
 	counter_u64_add(numcachehv, -1);
 }
 
 /*
  * UMA zones.
  */
 static uma_zone_t __read_mostly cache_zone_small;
 static uma_zone_t __read_mostly cache_zone_small_ts;
 static uma_zone_t __read_mostly cache_zone_large;
 static uma_zone_t __read_mostly cache_zone_large_ts;
 
 char *
 cache_symlink_alloc(size_t size, int flags)
 {
 
 	if (size < CACHE_ZONE_SMALL_SIZE) {
 		return (uma_zalloc_smr(cache_zone_small, flags));
 	}
 	if (size < CACHE_ZONE_LARGE_SIZE) {
 		return (uma_zalloc_smr(cache_zone_large, flags));
 	}
 	counter_u64_add(symlinktoobig, 1);
 	SDT_PROBE1(vfs, namecache, symlink, alloc__fail, size);
 	return (NULL);
 }
 
 /*
  * Free a buffer obtained from cache_symlink_alloc().
  *
  * "size" selects the zone using the same thresholds as the allocation, so it
  * has to be the size the buffer was allocated with.  The string must not be
  * NULL and the size must be below CACHE_ZONE_LARGE_SIZE (both asserted).
  */
 void
 cache_symlink_free(char *string, size_t size)
 {
 
 	MPASS(string != NULL);
 	KASSERT(size < CACHE_ZONE_LARGE_SIZE,
 	    ("%s: size %zu too big", __func__, size));
 
 	if (size < CACHE_ZONE_SMALL_SIZE)
 		uma_zfree_smr(cache_zone_small, string);
 	else if (size < CACHE_ZONE_LARGE_SIZE)
 		uma_zfree_smr(cache_zone_large, string);
 	else
 		__assert_unreachable();
 }
 
 static struct namecache *
 cache_alloc_uma(int len, bool ts)
 {
 	struct namecache_ts *ncp_ts;
 	struct namecache *ncp;
 
 	if (__predict_false(ts)) {
 		if (len <= CACHE_PATH_CUTOFF)
 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 		else
 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 		ncp = &ncp_ts->nc_nc;
 	} else {
 		if (len <= CACHE_PATH_CUTOFF)
 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 		else
 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 	}
 	return (ncp);
 }
 
 static void
 cache_free_uma(struct namecache *ncp)
 {
 	struct namecache_ts *ncp_ts;
 
 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 		else
 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 	} else {
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
 			uma_zfree_smr(cache_zone_small, ncp);
 		else
 			uma_zfree_smr(cache_zone_large, ncp);
 	}
 }
 
 static struct namecache *
 cache_alloc(int len, bool ts)
 {
 	u_long lnumcache;
 
 	/*
 	 * Avoid blowout in namecache entries.
 	 *
 	 * Bugs:
 	 * 1. filesystems may end up trying to add an already existing entry
 	 * (for example this can happen after a cache miss during concurrent
 	 * lookup), in which case we will call cache_neg_evict despite not
 	 * adding anything.
 	 * 2. the routine may fail to free anything and no provisions are made
 	 * to make it try harder (see the inside for failure modes)
 	 * 3. it only ever looks at negative entries.
 	 */
 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
 	if (cache_neg_evict_cond(lnumcache)) {
 		lnumcache = atomic_load_long(&numcache);
 	}
 	if (__predict_false(lnumcache >= ncsize)) {
 		atomic_subtract_long(&numcache, 1);
 		counter_u64_add(numdrops, 1);
 		return (NULL);
 	}
 	return (cache_alloc_uma(len, ts));
 }
 
 /*
  * Free a single entry: drop the hold on the directory vnode if the entry
  * is flagged with NCF_DVDROP, release the memory and decrement numcache.
  */
 static void
 cache_free(struct namecache *ncp)
 {
 
 	MPASS(ncp != NULL);
 	if (ncp->nc_flag & NCF_DVDROP)
 		cache_drop_vnode(ncp->nc_dvp);
 	cache_free_uma(ncp);
 	atomic_subtract_long(&numcache, 1);
 }
 
 /*
  * Free all entries linked on "batch" through nc_dst.
  *
  * Works like cache_free() for each entry, except numcache is adjusted once
  * for the whole batch.  The purge:batch probe fires with the number of
  * entries freed, which is 0 for an empty batch.
  */
 static void
 cache_free_batch(struct cache_freebatch *batch)
 {
 	struct namecache *cur, *next;
 	int freed;
 
 	freed = 0;
 	if (!TAILQ_EMPTY(batch)) {
 		TAILQ_FOREACH_SAFE(cur, batch, nc_dst, next) {
 			if (cur->nc_flag & NCF_DVDROP)
 				cache_drop_vnode(cur->nc_dvp);
 			cache_free_uma(cur);
 			freed++;
 		}
 		atomic_subtract_long(&numcache, freed);
 	}
 	SDT_PROBE1(vfs, namecache, purge, batch, freed);
 }
 
 /*
  * Hashing.
  *
  * The code was made to use FNV in 2001 and this choice needs to be revisited.
  *
  * Short summary of the difficulty:
  * The longest name which can be inserted is NAME_MAX characters in length (or
  * 255 at the time of writing this comment), while the majority of names used
  * in practice are significantly shorter (mostly below 10). More importantly,
  * the majority of names found by lookups are even shorter than that.
  *
  * This poses a problem where hashes which do better than FNV past word size
  * (or so) tend to come with additional overhead when finalizing the result,
  * making them noticeably slower for the most commonly used range.
  *
  * Consider a path like: /usr/obj/usr/src/sys/amd64/GENERIC/vnode_if.c
  *
  * When looking it up the most time consuming part by a large margin (at least
  * on amd64) is hashing.  Replacing FNV with something which pessimizes short
  * input would make the slowest part stand out even more.
  */
 
 /*
  * Precompute the per-vnode part of the hash and store it in v_nchash.
  * The value is the FNV hash of the vnode's address.
  *
  * TODO: With the value stored we can do better than computing the hash based
  * on the address.
  */
 static void
 cache_prehash(struct vnode *vp)
 {
 	uint32_t seed;
 
 	seed = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT);
 	vp->v_nchash = seed;
 }
 
 /*
  * Hash a (directory vnode; name) pair: the FNV hash of the name, seeded
  * with the directory's precomputed v_nchash.
  */
 static uint32_t
 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 {
 	uint32_t hash;
 
 	hash = fnv_32_buf(name, len, dvp->v_nchash);
 	return (hash);
 }
 
 /*
  * Start an incremental hash computation for a name in dvp.  The initial
  * value is the directory's precomputed v_nchash.
  */
 static uint32_t
 cache_get_hash_iter_start(struct vnode *dvp)
 {
 	uint32_t hash;
 
 	hash = dvp->v_nchash;
 	return (hash);
 }
 
 /*
  * Feed one more character of the name into an incremental hash and return
  * the updated value.
  */
 static uint32_t
 cache_get_hash_iter(char c, uint32_t hash)
 {
 	uint32_t next;
 
 	next = fnv_32_buf(&c, 1, hash);
 	return (next);
 }
 
 /*
  * Finalize an incremental hash.  No finalization step is performed; the
  * accumulated value is returned unchanged.
  */
 static uint32_t
 cache_get_hash_iter_finish(uint32_t hash)
 {
 	uint32_t result;
 
 	result = hash;
 	return (result);
 }
 
 /*
  * Return the hash chain head for the bucket an entry hashes to.  The hash is
  * recomputed from the entry's name and parent vnode.
  */
 static inline struct nchashhead *
 NCP2BUCKET(struct namecache *ncp)
 {
 
 	return (NCHHASH(cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 	    ncp->nc_dvp)));
 }
 
 /*
  * Return the bucket lock covering the bucket an entry hashes to.  The hash
  * is recomputed from the entry's name and parent vnode.
  */
 static inline struct mtx *
 NCP2BUCKETLOCK(struct namecache *ncp)
 {
 
 	return (HASH2BUCKETLOCK(cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 	    ncp->nc_dvp)));
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that the bucket lock covering ncp is owned by the caller.
  */
 static void
 cache_assert_bucket_locked(struct namecache *ncp)
 {
 
 	mtx_assert(NCP2BUCKETLOCK(ncp), MA_OWNED);
 }
 
 /*
  * Assert that the bucket lock covering ncp is not owned by the caller.
  */
 static void
 cache_assert_bucket_unlocked(struct namecache *ncp)
 {
 
 	mtx_assert(NCP2BUCKETLOCK(ncp), MA_NOTOWNED);
 }
 #else
 /* Without INVARIANTS the assertions compile to nothing. */
 #define cache_assert_bucket_locked(x) do { } while (0)
 #define cache_assert_bucket_unlocked(x) do { } while (0)
 #endif
 
 /*
  * Order two pointers in place so that *p1 <= *p2 on return.  At least one
  * of them must be non-NULL (asserted).  Callers in this file use the macro
  * with pointers to locks in order to obtain a consistent locking order.
  */
 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
 static void
 _cache_sort_vnodes(void **p1, void **p2)
 {
 	void *first, *second;
 
 	MPASS(*p1 != NULL || *p2 != NULL);
 
 	first = *p1;
 	second = *p2;
 	if (first > second) {
 		*p2 = first;
 		*p1 = second;
 	}
 }
 
 /*
  * Lock every bucket lock, in ascending index order.
  */
 static void
 cache_lock_all_buckets(void)
 {
 	u_int idx;
 
 	idx = 0;
 	while (idx < numbucketlocks) {
 		mtx_lock(&bucketlocks[idx]);
 		idx++;
 	}
 }
 
 /*
  * Unlock every bucket lock, in ascending index order.
  */
 static void
 cache_unlock_all_buckets(void)
 {
 	u_int idx;
 
 	idx = 0;
 	while (idx < numbucketlocks) {
 		mtx_unlock(&bucketlocks[idx]);
 		idx++;
 	}
 }
 
 /*
  * Lock every vnode lock, in ascending index order.
  */
 static void
 cache_lock_all_vnodes(void)
 {
 	u_int idx;
 
 	idx = 0;
 	while (idx < numvnodelocks) {
 		mtx_lock(&vnodelocks[idx]);
 		idx++;
 	}
 }
 
 /*
  * Unlock every vnode lock, in ascending index order.
  */
 static void
 cache_unlock_all_vnodes(void)
 {
 	u_int idx;
 
 	idx = 0;
 	while (idx < numvnodelocks) {
 		mtx_unlock(&vnodelocks[idx]);
 		idx++;
 	}
 }
 
 /*
  * Try to take up to two vnode locks without blocking.
  *
  * The locks are sorted by address first; after sorting only the first one
  * may be NULL, in which case it is skipped.  Returns 0 with the locks held,
  * or EAGAIN with none of them held.
  */
 static int
 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	cache_sort_vnodes(&vlp1, &vlp2);
 
 	if (vlp1 != NULL && !mtx_trylock(vlp1))
 		return (EAGAIN);
 	if (mtx_trylock(vlp2))
 		return (0);
 
 	/* The second lock is contended, back out of the first one. */
 	if (vlp1 != NULL)
 		mtx_unlock(vlp1);
 	return (EAGAIN);
 }
 
 /*
  * Lock up to two vnode locks, vlp1 first.
  *
  * The caller must pass the locks already sorted by address (vlp1 <= vlp2)
  * and at least one of them must be non-NULL; both conditions are asserted.
  * NULL locks are skipped.
  */
 static void
 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	MPASS(vlp1 != NULL || vlp2 != NULL);
 	MPASS(vlp1 <= vlp2);
 
 	if (vlp1 != NULL)
 		mtx_lock(vlp1);
 	if (vlp2 != NULL)
 		mtx_lock(vlp2);
 }
 
 /*
  * Unlock up to two vnode locks, vlp1 first.  At least one of them must be
  * non-NULL (asserted); NULL locks are skipped.
  */
 static void
 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
 {
 
 	MPASS(vlp1 != NULL || vlp2 != NULL);
 
 	if (vlp1 != NULL)
 		mtx_unlock(vlp1);
 	if (vlp2 != NULL)
 		mtx_unlock(vlp2);
 }
 
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
 	struct nchstats snap;
 
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
 
 	snap = nchstats;
 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
 	snap.ncs_neghits = counter_u64_fetch(numneghits);
 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 	    counter_u64_fetch(numnegzaps);
 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
 	    counter_u64_fetch(nummiss);
 
 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 }
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
     "VFS cache effectiveness statistics");
 
 static int
 sysctl_hitpct(SYSCTL_HANDLER_ARGS)
 {
 	long poshits, neghits, miss, total;
 	long pct;
 
 	poshits = counter_u64_fetch(numposhits);
 	neghits = counter_u64_fetch(numneghits);
 	miss = counter_u64_fetch(nummiss);
 	total = poshits + neghits + miss;
 
 	pct = 0;
 	if (total != 0)
 		pct = ((poshits + neghits) * 100) / total;
 	return (sysctl_handle_int(oidp, 0, pct, req));
 }
 SYSCTL_PROC(_vfs_cache_stats, OID_AUTO, hitpct,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_hitpct,
     "I", "Percentage of hits");
 
 /*
  * Recompute neg_min as ncnegminpct percent of ncsize.  Needs to be called
  * whenever either input changes.
  */
 static void
 cache_recalc_neg_min(void)
 {
 
 	neg_min = (ncsize * ncnegminpct) / 100;
 }
 
 /*
  * Sysctl handler for the negminpct knob.
  *
  * Reads report the current ncnegminpct.  Writes accept values in the range
  * 0..99 (anything else yields EINVAL), store the new value and recompute
  * neg_min from it.  Writing the value already in effect is a no-op.
  */
 static int
 sysctl_negminpct(SYSCTL_HANDLER_ARGS)
 {
 	u_int val;
 	int error;
 
 	val = ncnegminpct;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (val == ncnegminpct)
 		return (0);
 	/*
 	 * val is unsigned, so a "val < 0" test could never fire.  A negative
 	 * number written by the user shows up here as a very large value and
 	 * is rejected by the upper bound alone.
 	 */
 	if (val > 99)
 		return (EINVAL);
 	ncnegminpct = val;
 	cache_recalc_neg_min();
 	return (0);
 }
 
 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, negminpct,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_negminpct,
     "I", "Negative entry \% of namecache capacity above which automatic eviction is allowed");
 
 #ifdef DEBUG_CACHE
 /*
  * Debug-only sysctl subtree exposing statistics about the name cache hash
  * table.  The handlers below hold every bucket lock while counting, so the
  * chain lengths they report form an atomic snapshot.
  */
 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
     "hash table stats");
 
 /*
  * Sysctl handler returning one int per hash bucket holding the length of that
  * bucket's chain.
  *
  * The counting buffer is allocated before the bucket locks are taken.  If the
  * table size turns out to have changed once the locks are held, everything is
  * released and the attempt is repeated with the new size.
  */
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int i, error, n_nchash, *cntbuf;
 
 	for (;;) {
 		n_nchash = nchash + 1;	/* nchash is max index, not count */
 		if (req->oldptr == NULL)
 			return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 		cntbuf = malloc(n_nchash * sizeof(int), M_TEMP,
 		    M_ZERO | M_WAITOK);
 		cache_lock_all_buckets();
 		if (n_nchash == nchash + 1)
 			break;
 		/* The table was resized while we were allocating: start over. */
 		cache_unlock_all_buckets();
 		free(cntbuf, M_TEMP);
 	}
 
 	/* Scan hash tables counting entries */
 	for (i = 0; i < n_nchash; i++) {
 		ncpp = &nchashtbl[i];
 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
 	}
 	cache_unlock_all_buckets();
 
 	/* Copy the counts out one at a time, stopping at the first error. */
 	error = 0;
 	for (i = 0; i < n_nchash; i++) {
 		error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int));
 		if (error != 0)
 			break;
 	}
 	free(cntbuf, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
     "nchash chain lengths");
 
 /*
  * Sysctl handler returning four ints describing the hash table: the number of
  * buckets, the number of non-empty buckets, the longest chain, and the share
  * of non-empty buckets expressed in hundredths of a percent.
  *
  * All bucket locks are held while the chains are walked.
  */
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash, divisor;
 	int count, maxlength, used, pct;
 
 	/* No output buffer supplied: only report the size of the data. */
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
 	cache_lock_all_buckets();
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		CK_SLIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	/* The loop above counted n_nchash down to zero; reload it. */
 	n_nchash = nchash + 1;
 	cache_unlock_all_buckets();
 
 	/*
 	 * n_nchash / 100 is zero for tables with fewer than 100 buckets, so
 	 * dividing by it unconditionally would trap.  For such small tables
 	 * compute the same quantity directly; the product cannot overflow
 	 * there because used is below 100.
 	 */
 	pct = 0;
 	divisor = n_nchash / 100;
 	if (divisor != 0)
 		pct = (used * 100) / divisor;
 	else if (n_nchash != 0)
 		pct = (used * 10000) / n_nchash;
 
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	return (SYSCTL_OUT(req, &pct, sizeof(pct)));
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
  * Negative entries management
  *
  * Various workloads create plenty of negative entries and barely use them
  * afterwards. Moreover malicious users can keep performing bogus lookups
  * adding even more entries. For example "make tinderbox" as of writing this
  * comment ends up with 2.6M namecache entries in total, 1.2M of which are
  * negative.
  *
  * As such, a rather aggressive eviction method is needed. The currently
  * employed method is a placeholder.
  *
  * Entries are split over numneglists separate lists, each of which is further
  * split into hot and cold entries. Entries get promoted after getting a hit.
  * Eviction happens on addition of new entry.
  */
 /* Parent node for all negative entry statistics below. */
 static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Name cache negative entry statistics");
 
 /* Current number of negative entries (numneg). */
 SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
     "Number of negative cache entries");
 
 /* Bumped by cache_neg_init(). */
 static COUNTER_U64_DEFINE_EARLY(neg_created);
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
     "Number of created negative entries");
 
 /* Bumped by cache_neg_evict() when an entry was actually zapped. */
 static COUNTER_U64_DEFINE_EARLY(neg_evicted);
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
     "Number of evicted negative entries");
 
 /* Bumped by cache_neg_evict() when the selected list had no cold entry. */
 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
     &neg_evict_skipped_empty,
     "Number of times evicting failed due to lack of entries");
 
 /* Bumped by cache_neg_evict() when the victim was gone after relocking. */
 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_missed);
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_missed, CTLFLAG_RD,
     &neg_evict_skipped_missed,
     "Number of times evicting failed due to target entry disappearing");
 
 /* Bumped by cache_neg_evict_select_list() when the evict lock was busy. */
 static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
     &neg_evict_skipped_contended,
     "Number of times evicting failed due to contention");
 
 /* Same counter as used for the general negative hit statistics. */
 SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
     "Number of cache hits (negative)");
 
 /*
  * Sysctl handler reporting the total number of hot negative entries, i.e.
  * the sum of nl_hotnum over all neglists.  The per-list values are read
  * without taking the list locks.
  */
 static int
 sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
 {
 	int idx, total;
 
 	total = 0;
 	idx = 0;
 	while (idx < numneglists) {
 		total += neglists[idx].nl_hotnum;
 		idx++;
 	}
 
 	return (SYSCTL_OUT(req, &total, sizeof(total)));
 }
 SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
     "Number of hot negative entries");
 
 /*
  * Turn a freshly set up entry into a negative one: mark it NCF_NEGATIVE,
  * reset its negative state (not hot, no hits) and count the creation.
  */
 static void
 cache_neg_init(struct namecache *ncp)
 {
 	struct negstate *ns;
 
 	/*
 	 * NOTE(review): the flag is set before NCP2NEGSTATE() is used; the
 	 * helper is not visible here and may check the flag, so keep this
 	 * order.
 	 */
 	ncp->nc_flag |= NCF_NEGATIVE;
 	ns = NCP2NEGSTATE(ncp);
 	ns->neg_flag = 0;
 	ns->neg_hit = 0;
 	counter_u64_add(neg_created, 1);
 }
 
 /* Hit count at which a negative entry becomes a candidate for promotion. */
 #define CACHE_NEG_PROMOTION_THRESH 2
 
 /*
  * Account for a hit on a negative entry by bumping its neg_hit counter.
  *
  * Returns true only for the caller whose increment brought the counter to
  * CACHE_NEG_PROMOTION_THRESH.  Returns false if the counter was already at or
  * above the threshold (it is then left untouched) or is still below it after
  * the increment.
  */
 static bool
 cache_neg_hit_prep(struct namecache *ncp)
 {
 	struct negstate *ns;
 	u_char n;
 
 	ns = NCP2NEGSTATE(ncp);
 	n = atomic_load_char(&ns->neg_hit);
 	for (;;) {
 		if (n >= CACHE_NEG_PROMOTION_THRESH)
 			return (false);
 		/*
 		 * n is not reloaded by hand on failure: the loop relies on
 		 * atomic_fcmpset_8() writing the value it found back into n
 		 * (see atomic(9)).
 		 */
 		if (atomic_fcmpset_8(&ns->neg_hit, &n, n + 1))
 			break;
 	}
 	return (n + 1 == CACHE_NEG_PROMOTION_THRESH);
 }
 
 /*
  * Counterpart of cache_neg_hit_prep() for callers which end up returning
  * without even trying to promote.  There is nothing to undo, so this expands
  * to an empty statement; it is provided for completeness.
  */
 #define cache_neg_hit_abort(ncp)	do { } while (0)
 
 /*
  * Record a completed negative hit: fire the hit__negative probe with the
  * entry's directory vnode and name, then bump the negative hit counter.
  */
 static void
 cache_neg_hit_finish(struct namecache *ncp)
 {
 
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, ncp->nc_dvp, ncp->nc_name);
 	counter_u64_add(numneghits, 1);
 }
 
 /*
  * Move a negative entry from the cold list to the tail of the hot list.
  *
  * The neglist lock must be held.  An entry which is already hot is left
  * exactly where it is.
  */
 static void
 cache_neg_promote_locked(struct namecache *ncp)
 {
 	struct neglist *nl;
 	struct negstate *ns;
 
 	ns = NCP2NEGSTATE(ncp);
 	nl = NCP2NEGLIST(ncp);
 	mtx_assert(&nl->nl_lock, MA_OWNED);
 
 	if ((ns->neg_flag & NEG_HOT) != 0)
 		return;
 
 	TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&nl->nl_hotlist, ncp, nc_dst);
 	nl->nl_hotnum++;
 	ns->neg_flag |= NEG_HOT;
 }
 
 /*
  * Move a hot negative entry to the cold list.
  *
  * The neglist lock must be held and the entry must currently be hot.  The
  * entry goes to the tail of the cold list and its hit counter is reset, so
  * it has to collect hits again before it can be promoted.
  */
 static void
 cache_neg_demote_locked(struct namecache *ncp)
 {
 	struct neglist *nl;
 	struct negstate *ns;
 
 	ns = NCP2NEGSTATE(ncp);
 	nl = NCP2NEGLIST(ncp);
 	mtx_assert(&nl->nl_lock, MA_OWNED);
 	MPASS(ns->neg_flag & NEG_HOT);
 	TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 	nl->nl_hotnum--;
 	ns->neg_flag &= ~NEG_HOT;
 	/* neg_hit is updated without the list lock elsewhere, hence atomic. */
 	atomic_store_char(&ns->neg_hit, 0);
 }
 
 /*
  * Move a negative entry to the hot list if it matches the lookup.
  *
  * We have to take locks, but they may be contended and in the worst
  * case we may need to go off CPU. We don't want to spin within the
  * smr section and we can't block with it. Exiting the section means
  * the found entry could have been evicted. We are going to look it
  * up again.
  *
  * dvp, cnp and hash describe the lookup; oncp is the entry the caller found
  * before leaving its smr section.
  *
  * Returns true if oncp is still in the hash chain, still matches the lookup,
  * is still negative and usable; it has then been promoted and the hit has
  * been counted.  Returns false otherwise, with nothing modified.
  */
 static bool
 cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
     struct namecache *oncp, uint32_t hash)
 {
 	struct namecache *ncp;
 	struct neglist *nl;
 	u_char nc_flag;
 
 	nl = NCP2NEGLIST(oncp);
 
 	/* The list lock is taken before entering the smr section. */
 	mtx_lock(&nl->nl_lock);
 	/*
 	 * For hash iteration.
 	 */
 	vfs_smr_enter();
 
 	/*
 	 * Avoid all surprises by only succeeding if we got the same entry and
 	 * bailing completely otherwise.
 	 * XXX There are no provisions to keep the vnode around, meaning we may
 	 * end up promoting a negative entry for a *new* vnode and returning
 	 * ENOENT on its account. This is the error we want to return anyway
 	 * and promotion is harmless.
 	 *
 	 * In particular at this point there can be a new ncp which matches the
 	 * search but hashes to a different neglist.
 	 */
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp == oncp)
 			break;
 	}
 
 	/*
 	 * No match to begin with.
 	 */
 	if (__predict_false(ncp == NULL)) {
 		goto out_abort;
 	}
 
 	/*
 	 * The newly found entry may be something different...
 	 */
 	if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 	    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) {
 		goto out_abort;
 	}
 
 	/*
 	 * ... and not even negative.
 	 */
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_NEGATIVE) == 0) {
 		goto out_abort;
 	}
 
 	if (!cache_ncp_canuse(ncp)) {
 		goto out_abort;
 	}
 
 	cache_neg_promote_locked(ncp);
 	cache_neg_hit_finish(ncp);
 	vfs_smr_exit();
 	mtx_unlock(&nl->nl_lock);
 	return (true);
 out_abort:
 	vfs_smr_exit();
 	mtx_unlock(&nl->nl_lock);
 	return (false);
 }
 
 /*
  * Promote a negative entry to the hot list, taking and releasing the lock of
  * its neglist around cache_neg_promote_locked().
  */
 static void
 cache_neg_promote(struct namecache *ncp)
 {
 	struct neglist *nl;
 
 	nl = NCP2NEGLIST(ncp);
 	mtx_lock(&nl->nl_lock);
 	cache_neg_promote_locked(ncp);
 	mtx_unlock(&nl->nl_lock);
 }
 
 /*
  * Link a negative entry at the tail of its neglist's cold list and bump the
  * global negative entry count.
  *
  * The entry must already be marked NCF_NEGATIVE and the bucket lock covering
  * it must be held; the neglist lock is taken here.
  */
 static void
 cache_neg_insert(struct namecache *ncp)
 {
 	struct neglist *nl;
 
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	cache_assert_bucket_locked(ncp);
 	nl = NCP2NEGLIST(ncp);
 	mtx_lock(&nl->nl_lock);
 	TAILQ_INSERT_TAIL(&nl->nl_list, ncp, nc_dst);
 	mtx_unlock(&nl->nl_lock);
 	/* numneg is updated outside of the list lock, hence atomically. */
 	atomic_add_long(&numneg, 1);
 }
 
 /*
  * Unlink a negative entry from whichever list of its neglist it is on (hot
  * or cold) and drop the global negative entry count.
  *
  * The bucket lock covering the entry must be held; the neglist lock is taken
  * here.
  */
 static void
 cache_neg_remove(struct namecache *ncp)
 {
 	struct neglist *nl;
 	struct negstate *ns;
 
 	cache_assert_bucket_locked(ncp);
 	nl = NCP2NEGLIST(ncp);
 	ns = NCP2NEGSTATE(ncp);
 	mtx_lock(&nl->nl_lock);
 	if ((ns->neg_flag & NEG_HOT) == 0) {
 		TAILQ_REMOVE(&nl->nl_list, ncp, nc_dst);
 	} else {
 		TAILQ_REMOVE(&nl->nl_hotlist, ncp, nc_dst);
 		nl->nl_hotnum--;
 	}
 	mtx_unlock(&nl->nl_lock);
 	atomic_subtract_long(&numneg, 1);
 }
 
 /*
  * Pick the neglist to evict from, cycling through the lists with the shared
  * neg_cycle counter.
  *
  * Returns the list with its evict lock held, or NULL (after counting the
  * event) if that lock could not be taken without blocking.
  */
 static struct neglist *
 cache_neg_evict_select_list(void)
 {
 	struct neglist *nl;
 	u_int c;
 
 	c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
 	nl = &neglists[c % numneglists];
 	if (mtx_trylock(&nl->nl_evict_lock))
 		return (nl);
 
 	counter_u64_add(neg_evict_skipped_contended, 1);
 	return (NULL);
 }
 
 /*
  * Choose the eviction victim from the cold list of nl.
  *
  * Looks at up to the first four entries and returns the one with the lowest
  * hit count; on a tie the entry closer to the head wins.  Returns NULL if the
  * cold list is empty.  Both the evict lock and the list lock must be held.
  */
 static struct namecache *
 cache_neg_evict_select_entry(struct neglist *nl)
 {
 	struct namecache *cur, *best;
 	struct negstate *cur_ns, *best_ns;
 	int seen;
 
 	mtx_assert(&nl->nl_evict_lock, MA_OWNED);
 	mtx_assert(&nl->nl_lock, MA_OWNED);
 
 	best = TAILQ_FIRST(&nl->nl_list);
 	if (best == NULL)
 		return (NULL);
 	best_ns = NCP2NEGSTATE(best);
 
 	seen = 1;
 	cur = TAILQ_NEXT(best, nc_dst);
 	while (seen < 4 && cur != NULL) {
 		cur_ns = NCP2NEGSTATE(cur);
 		if (cur_ns->neg_hit < best_ns->neg_hit) {
 			best = cur;
 			best_ns = cur_ns;
 		}
 		seen++;
 		cur = TAILQ_NEXT(cur, nc_dst);
 	}
 	return (best);
 }
 
 /*
  * Try to evict one negative entry.
  *
  * Selects a neglist (giving up if its evict lock is contended), demotes the
  * first hot entry if there is one, then picks a victim from the cold list.
  * The list locks cannot be held while taking the vnode and bucket locks, so
  * the victim's identity is recorded, the list locks are dropped and the
  * entry is looked up again under the proper locks.
  *
  * Returns true if an entry was zapped and freed, false otherwise.
  */
 static bool
 cache_neg_evict(void)
 {
 	struct namecache *ncp, *ncp2;
 	struct neglist *nl;
 	struct vnode *dvp;
 	struct mtx *dvlp;
 	struct mtx *blp;
 	uint32_t hash;
 	u_char nlen;
 	bool evicted;
 
 	nl = cache_neg_evict_select_list();
 	if (nl == NULL) {
 		return (false);
 	}
 
 	mtx_lock(&nl->nl_lock);
 	/* Age the hot list: its head goes back to the cold list. */
 	ncp = TAILQ_FIRST(&nl->nl_hotlist);
 	if (ncp != NULL) {
 		cache_neg_demote_locked(ncp);
 	}
 	ncp = cache_neg_evict_select_entry(nl);
 	if (ncp == NULL) {
 		counter_u64_add(neg_evict_skipped_empty, 1);
 		mtx_unlock(&nl->nl_lock);
 		mtx_unlock(&nl->nl_evict_lock);
 		return (false);
 	}
 	/* Snapshot what is needed to find the entry again after relocking. */
 	nlen = ncp->nc_nlen;
 	dvp = ncp->nc_dvp;
 	hash = cache_get_hash(ncp->nc_name, nlen, dvp);
 	dvlp = VP2VNODELOCK(dvp);
 	blp = HASH2BUCKETLOCK(hash);
 	mtx_unlock(&nl->nl_lock);
 	mtx_unlock(&nl->nl_evict_lock);
 	mtx_lock(dvlp);
 	mtx_lock(blp);
 	/*
 	 * Note that since all locks were dropped above, the entry may be
 	 * gone or reallocated to be something else.
 	 */
 	CK_SLIST_FOREACH(ncp2, (NCHHASH(hash)), nc_hash) {
 		if (ncp2 == ncp && ncp2->nc_dvp == dvp &&
 		    ncp2->nc_nlen == nlen && (ncp2->nc_flag & NCF_NEGATIVE) != 0)
 			break;
 	}
 	if (ncp2 == NULL) {
 		counter_u64_add(neg_evict_skipped_missed, 1);
 		/* Not ours to free. */
 		ncp = NULL;
 		evicted = false;
 	} else {
 		MPASS(dvlp == VP2VNODELOCK(ncp->nc_dvp));
 		MPASS(blp == NCP2BUCKETLOCK(ncp));
 		SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
 		    ncp->nc_name);
 		cache_zap_locked(ncp);
 		counter_u64_add(neg_evicted, 1);
 		evicted = true;
 	}
 	mtx_unlock(blp);
 	mtx_unlock(dvlp);
 	/* The entry is freed only after all locks have been dropped. */
 	if (ncp != NULL)
 		cache_free(ncp);
 	return (evicted);
 }
 
 /*
  * Maybe evict a negative entry to create more room.
  *
  * The ncnegfactor parameter limits what fraction of the total count
  * can comprise of negative entries. However, if the cache is just
  * warming up this leads to excessive evictions.  As such, ncnegminpct
  * (recomputed to neg_min) dictates whether the above should be
  * applied.
  *
  * Try evicting if the cache is close to full capacity regardless of
  * other considerations.
  */
 static bool
 cache_neg_evict_cond(u_long lnumcache)
 {
 	u_long lnumneg;
 
 	/*
 	 * Unless the cache is within 1000 entries of its capacity, evict only
 	 * when negative entries are both numerous enough in absolute terms
 	 * (neg_min) and make up a large enough share of the total
 	 * (ncnegfactor).
 	 */
 	if (ncsize - 1000 >= lnumcache) {
 		lnumneg = atomic_load_long(&numneg);
 		if (lnumneg < neg_min)
 			return (false);
 		if (lnumneg * ncnegfactor < lnumcache)
 			return (false);
 	}
 	return (cache_neg_evict());
 }
 
 /*
  * cache_zap_locked():
  *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  *
  *   The caller must hold the vnode lock of the directory, the vnode lock of
  *   the target vnode for a positive entry, and the bucket lock of the entry.
  *   The entry is only unlinked here, not freed.  If it was the last entry on
  *   the directory's v_cache_src list, NCF_DVDROP is set on it.
  */
 static void
 cache_zap_locked(struct namecache *ncp)
 {
 	struct nchashhead *ncpp;
 	struct vnode *dvp, *vp;
 
 	dvp = ncp->nc_dvp;
 	vp = ncp->nc_vp;
 
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		cache_assert_vnode_locked(vp);
 	cache_assert_vnode_locked(dvp);
 	cache_assert_bucket_locked(ncp);
 
 	/* Invalidate first, then take the entry off the hash chain. */
 	cache_ncp_invalidate(ncp);
 
 	ncpp = NCP2BUCKET(ncp);
 	CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash);
 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		/* Positive entry: detach from the target vnode. */
 		SDT_PROBE3(vfs, namecache, zap, done, dvp, ncp->nc_name, vp);
 		TAILQ_REMOVE(&vp->v_cache_dst, ncp, nc_dst);
 		if (ncp == vp->v_cache_dd) {
 			atomic_store_ptr(&vp->v_cache_dd, NULL);
 		}
 	} else {
 		/* Negative entry: detach from its neglist. */
 		SDT_PROBE2(vfs, namecache, zap_negative, done, dvp, ncp->nc_name);
 		cache_neg_remove(ncp);
 	}
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
 		/* A ".." entry is not on v_cache_src, only on v_cache_dd. */
 		if (ncp == dvp->v_cache_dd) {
 			atomic_store_ptr(&dvp->v_cache_dd, NULL);
 		}
 	} else {
 		LIST_REMOVE(ncp, nc_src);
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			ncp->nc_flag |= NCF_DVDROP;
 		}
 	}
 }
 
 /*
  * Zap a negative entry whose directory vnode lock the caller already holds.
  *
  * vp must be the entry's directory (nc_dvp).  Only the bucket lock is taken
  * here, and it is dropped again before returning; the vnode lock stays held.
  * The entry is unlinked but not freed.
  */
 static void
 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 {
 	struct mtx *blp;
 
 	MPASS(ncp->nc_dvp == vp);
 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
 	cache_assert_vnode_locked(vp);
 
 	blp = NCP2BUCKETLOCK(ncp);
 	mtx_lock(blp);
 	cache_zap_locked(ncp);
 	mtx_unlock(blp);
 }
 
 /*
  * Zap an entry while the caller holds the vnode lock of vp, which must be
  * either the entry's directory or its target vnode.
  *
  * *vlpp is an additional vnode lock the caller may be holding from an earlier
  * attempt (or NULL).  It is either reused, if it happens to be one of the
  * locks needed, or dropped; in both cases *vlpp is reset to NULL.
  *
  * Returns true if the entry was zapped; vp's lock is still held and *vlpp is
  * NULL.  Returns false if the other vnode lock could not be taken without
  * violating the lock order: both locks have then been acquired in address
  * order, *vlpp points at the extra one, and the caller has to re-validate
  * the entry and try again.
  */
 static bool
 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
     struct mtx **vlpp)
 {
 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
 	struct mtx *blp;
 
 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
 	cache_assert_vnode_locked(vp);
 
 	/* Negative entries have no second vnode, so no extra lock is needed. */
 	if (ncp->nc_flag & NCF_NEGATIVE) {
 		if (*vlpp != NULL) {
 			mtx_unlock(*vlpp);
 			*vlpp = NULL;
 		}
 		cache_zap_negative_locked_vnode_kl(ncp, vp);
 		return (true);
 	}
 
 	pvlp = VP2VNODELOCK(vp);
 	blp = NCP2BUCKETLOCK(ncp);
 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
 
 	if (*vlpp == vlp1 || *vlpp == vlp2) {
 		/* The lock carried over from the caller is one we need. */
 		to_unlock = *vlpp;
 		*vlpp = NULL;
 	} else {
 		if (*vlpp != NULL) {
 			mtx_unlock(*vlpp);
 			*vlpp = NULL;
 		}
 		cache_sort_vnodes(&vlp1, &vlp2);
 		if (vlp1 == pvlp) {
 			/* The held lock sorts first: blocking is safe. */
 			mtx_lock(vlp2);
 			to_unlock = vlp2;
 		} else {
 			/* The held lock sorts second: may only trylock. */
 			if (!mtx_trylock(vlp1))
 				goto out_relock;
 			to_unlock = vlp1;
 		}
 	}
 	mtx_lock(blp);
 	cache_zap_locked(ncp);
 	mtx_unlock(blp);
 	if (to_unlock != NULL)
 		mtx_unlock(to_unlock);
 	return (true);
 
 out_relock:
 	/*
 	 * Only reached with vlp1 != pvlp; given the MPASS on vp above, vlp2
 	 * is then the lock the caller holds.  Drop it and take both locks in
 	 * address order.
 	 */
 	mtx_unlock(vlp2);
 	mtx_lock(vlp1);
 	mtx_lock(vlp2);
 	MPASS(*vlpp == NULL);
 	*vlpp = vlp1;
 	return (false);
 }
 
 /*
  * If trylocking failed we can get here. We know enough to take all needed locks
  * in the right order and re-lookup the entry.
  *
  * ncp, cnp, dvp and hash identify the entry the caller found; dvlp and vlp
  * are the vnode locks it derived from the entry's directory and target vnode
  * (vlp is NULL for a negative entry); blp is the bucket lock, which must not
  * be held on entry.
  *
  * Returns 0 if the very same entry was found again and zapped, EAGAIN if it
  * is gone or no longer matches.  All locks taken here are dropped either way.
  * The entry is not freed.
  */
 static int
 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
     struct mtx *blp)
 {
 	struct namecache *rncp;
 	struct mtx *rvlp, *tvlp;
 
 	cache_assert_bucket_unlocked(ncp);
 
 	/*
 	 * Remember the lock derived from the target vnode before sorting:
 	 * cache_sort_vnodes() may swap dvlp and vlp in place, after which
 	 * vlp is merely the higher-addressed lock of the pair.  Comparing
 	 * against the sorted value would reject every negative entry (the
 	 * NULL lock always sorts first) as well as positive entries whose
 	 * directory lock sorts higher.
 	 */
 	tvlp = vlp;
 	cache_sort_vnodes(&dvlp, &vlp);
 	cache_lock_vnodes(dvlp, vlp);
 	mtx_lock(blp);
 	CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 		if (rncp == ncp && rncp->nc_dvp == dvp &&
 		    rncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
 			break;
 	}
 
 	if (rncp == NULL)
 		goto out_mismatch;
 
 	/*
 	 * The entry may have changed kind or target while no lock was held;
 	 * make sure the target vnode lock we hold is still the right one.
 	 */
 	if (!(ncp->nc_flag & NCF_NEGATIVE))
 		rvlp = VP2VNODELOCK(rncp->nc_vp);
 	else
 		rvlp = NULL;
 	if (rvlp != tvlp)
 		goto out_mismatch;
 
 	cache_zap_locked(rncp);
 	mtx_unlock(blp);
 	cache_unlock_vnodes(dvlp, vlp);
 	atomic_add_long(&zap_bucket_relock_success, 1);
 	return (0);
 
 out_mismatch:
 	mtx_unlock(blp);
 	cache_unlock_vnodes(dvlp, vlp);
 	return (EAGAIN);
 }
 
 /*
  * Zap an entry while the caller holds its bucket lock (blp).
  *
  * The vnode locks are tried first without blocking.  If that fails, the
  * bucket lock is dropped and cache_zap_unlocked_bucket() retakes everything
  * in order.  The bucket lock is released on every path.
  *
  * Returns 0 if the entry was zapped, otherwise whatever
  * cache_zap_unlocked_bucket() returns.
  */
 static int __noinline
 cache_zap_locked_bucket(struct namecache *ncp, struct componentname *cnp,
     uint32_t hash, struct mtx *blp)
 {
 	struct mtx *dvlp, *vlp;
 	struct vnode *dvp;
 
 	cache_assert_bucket_locked(ncp);
 
 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
 	if ((ncp->nc_flag & NCF_NEGATIVE) != 0)
 		vlp = NULL;
 	else
 		vlp = VP2VNODELOCK(ncp->nc_vp);
 
 	if (cache_trylock_vnodes(dvlp, vlp) != 0) {
 		/* Read nc_dvp while the bucket lock still pins the entry. */
 		dvp = ncp->nc_dvp;
 		mtx_unlock(blp);
 		return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp,
 		    hash, blp));
 	}
 
 	cache_zap_locked(ncp);
 	mtx_unlock(blp);
 	cache_unlock_vnodes(dvlp, vlp);
 	return (0);
 }
 
 /*
  * Remove the cache entry for name cnp in directory dvp, if there is one.
  *
  * ".." is handled through dvp->v_cache_dd; every other name is looked up in
  * the hash.  Returns 1 if an entry was removed (or, for "..", if v_cache_dd
  * was cleared) and 0 if nothing was found.
  */
 static __noinline int
 cache_remove_cnp(struct vnode *dvp, struct componentname *cnp)
 {
 	struct namecache *ncp;
 	struct mtx *blp;
 	struct mtx *dvlp, *dvlp2;
 	uint32_t hash;
 	int error;
 
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
 		dvlp = VP2VNODELOCK(dvp);
 		/* Extra lock handed back by cache_zap_locked_vnode_kl2(). */
 		dvlp2 = NULL;
 		mtx_lock(dvlp);
 retry_dotdot:
 		ncp = dvp->v_cache_dd;
 		if (ncp == NULL) {
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 			SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 			return (0);
 		}
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 			/* A real ".." entry: zap and free it. */
 			if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2))
 				goto retry_dotdot;
 			MPASS(dvp->v_cache_dd == NULL);
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 			cache_free(ncp);
 		} else {
 			/*
 			 * v_cache_dd points at a regular entry; only the
 			 * pointer is cleared, the entry itself stays.
 			 */
 			atomic_store_ptr(&dvp->v_cache_dd, NULL);
 			mtx_unlock(dvlp);
 			if (dvlp2 != NULL)
 				mtx_unlock(dvlp2);
 		}
 		SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 		return (1);
 	}
 
 	/*
 	 * XXX note that access here is completely unlocked with no provisions
 	 * to keep the hash allocated. If one is sufficiently unlucky a
 	 * parallel cache resize can reallocate the hash, unmap backing pages
 	 * and cause the empty check below to fault.
 	 *
 	 * Fixing this has epsilon priority, but can be done with no overhead
 	 * for this codepath with sufficient effort.
 	 */
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	blp = HASH2BUCKETLOCK(hash);
 retry:
 	/* Cheap unlocked check to avoid taking the lock for empty chains. */
 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
 		goto out_no_entry;
 
 	mtx_lock(blp);
 
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	if (ncp == NULL) {
 		mtx_unlock(blp);
 		goto out_no_entry;
 	}
 
 	/* Releases blp on both success and failure. */
 	error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 	if (__predict_false(error != 0)) {
 		atomic_add_long(&zap_bucket_fail, 1);
 		goto retry;
 	}
 	counter_u64_add(numposzaps, 1);
 	SDT_PROBE2(vfs, namecache, removecnp, hit, dvp, cnp);
 	cache_free(ncp);
 	return (1);
 out_no_entry:
 	counter_u64_add(nummisszap, 1);
 	SDT_PROBE2(vfs, namecache, removecnp, miss, dvp, cnp);
 	return (0);
 }
 
 /*
  * Handle a lookup of ".": the result is dvp itself.
  *
  * An extra reference is taken on the vnode, *tsp is cleared and *ticksp is
  * set to the current ticks when provided.  The lock held on the vnode is then
  * adjusted to the type requested in cnp->cn_lkflags.
  *
  * Returns -1 (positive hit), or ENOENT with *vpp set to NULL if the vnode
  * turned out to be doomed after a lock upgrade.
  */
 static int __noinline
 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	int wanted;
 
 	*vpp = dvp;
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 	if (tsp != NULL)
 		timespecclear(tsp);
 	if (ticksp != NULL)
 		*ticksp = ticks;
 	vrefact(*vpp);
 
 	/*
 	 * When we lookup "." we still can be asked to lock it
 	 * differently...
 	 */
 	wanted = cnp->cn_lkflags & LK_TYPE_MASK;
 	if (wanted == VOP_ISLOCKED(*vpp))
 		return (-1);
 	if (wanted != LK_EXCLUSIVE) {
 		vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 		return (-1);
 	}
 
 	vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 	if (VN_IS_DOOMED((*vpp))) {
 		/* forced unmount */
 		vrele(*vpp);
 		*vpp = NULL;
 		return (ENOENT);
 	}
 	return (-1);
 }
 
 /*
  * Handle a lookup of ".." through dvp->v_cache_dd.
  *
  * Without MAKEENTRY the cached entry is removed instead and 0 is returned.
  * Otherwise returns 0 on a miss, -1 on a positive hit with *vpp locked and
  * referenced, or ENOENT on a negative hit or if dvp got doomed while its
  * lock was dropped.  dvp is unlocked and relocked around acquiring *vpp.
  */
 static int __noinline
 cache_lookup_dotdot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	struct namecache_ts *ncp_ts;
 	struct namecache *ncp;
 	struct mtx *dvlp;
 	enum vgetstate vs;
 	int error, ltype;
 	bool whiteout;
 
 	MPASS((cnp->cn_flags & ISDOTDOT) != 0);
 
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		cache_remove_cnp(dvp, cnp);
 		return (0);
 	}
 
 retry:
 	dvlp = VP2VNODELOCK(dvp);
 	mtx_lock(dvlp);
 	ncp = dvp->v_cache_dd;
 	if (ncp == NULL) {
 		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, "..");
 		mtx_unlock(dvlp);
 		return (0);
 	}
 	/*
 	 * v_cache_dd is either a dedicated ".." entry, whose target is the
 	 * parent (or nothing, if negative), or a regular entry naming dvp,
 	 * whose directory is the parent.
 	 */
 	if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
 		if (ncp->nc_flag & NCF_NEGATIVE)
 			*vpp = NULL;
 		else
 			*vpp = ncp->nc_vp;
 	} else
 		*vpp = ncp->nc_dvp;
 	if (*vpp == NULL)
 		goto negative_success;
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp);
 	cache_out_ts(ncp, tsp, ticksp);
 	/* A regular entry with timestamps stores the ".." time separately. */
 	if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 	    NCF_DTS && tsp != NULL) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		*tsp = ncp_ts->nc_dotdottime;
 	}
 
 	MPASS(dvp != *vpp);
 	/* Remember how dvp was locked so the same lock type can be retaken. */
 	ltype = VOP_ISLOCKED(dvp);
 	VOP_UNLOCK(dvp);
 	vs = vget_prep(*vpp);
 	mtx_unlock(dvlp);
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	vn_lock(dvp, ltype | LK_RETRY);
 	if (VN_IS_DOOMED(dvp)) {
 		if (error == 0)
 			vput(*vpp);
 		*vpp = NULL;
 		return (ENOENT);
 	}
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	return (-1);
 negative_success:
 	/* Creating the last component: the negative entry is about to go stale. */
 	if (__predict_false(cnp->cn_nameiop == CREATE)) {
 		if (cnp->cn_flags & ISLASTCN) {
 			counter_u64_add(numnegzaps, 1);
 			cache_zap_negative_locked_vnode_kl(ncp, dvp);
 			mtx_unlock(dvlp);
 			cache_free(ncp);
 			return (0);
 		}
 	}
 
 	whiteout = (ncp->nc_flag & NCF_WHITE);
 	cache_out_ts(ncp, tsp, ticksp);
 	/*
 	 * NOTE(review): when the entry is promoted, cache_neg_hit_finish() is
 	 * not called on this path, so that hit is not counted - confirm this
 	 * is intended.
 	 */
 	if (cache_neg_hit_prep(ncp))
 		cache_neg_promote(ncp);
 	else
 		cache_neg_hit_finish(ncp);
 	mtx_unlock(dvlp);
 	if (whiteout)
 		cnp->cn_flags |= ISWHITEOUT;
 	return (ENOENT);
 }
 
 /**
  * Lookup a name in the name cache
  *
  * # Arguments
  *
  * - dvp:	Parent directory in which to search.
  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
  * - cnp:	Parameters of the name search.  The most interesting bits of
  *   		the cn_flags field have the following meanings:
  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
  *   			it up.
  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
  * - tsp:	Return storage for cache timestamp.  On a successful (positive
  *   		or negative) lookup, tsp will be filled with any timespec that
  *   		was stored when this cache entry was created.  However, it will
  *   		be clear for "." entries.
  * - ticksp:	Return storage for alternate cache timestamp.  On a successful
  *   		(positive or negative) lookup, it will contain the ticks value
  *   		that was current when the cache entry was created, unless cnp
  *   		was ".".
  *
  * Either both tsp and ticksp have to be provided or neither of them.
  *
  * # Returns
  *
  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
  *		to a forced unmount.  vpp will not be modified.  If the entry
  *		is a whiteout, then the ISWHITEOUT flag will be set in
  *		cnp->cn_flags.
  * - 0:		A cache miss.  vpp will not be modified.
  *
  * # Locking
  *
  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
  * lock is not recursively acquired.
  */
 static int __noinline
 cache_lookup_fallback(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	struct namecache *ncp;
 	struct mtx *blp;
 	uint32_t hash;
 	enum vgetstate vs;
 	int error;
 	bool whiteout;
 
 	/*
 	 * Locked slow path: cache_lookup() ends up here whenever its lockless
 	 * attempt could not be completed.  The whole lookup is redone under
 	 * the bucket lock.  "." and ".." never get here.
 	 */
 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 	MPASS((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) != 0);
 
 retry:
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	blp = HASH2BUCKETLOCK(hash);
 	mtx_lock(blp);
 
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	if (__predict_false(ncp == NULL)) {
 		mtx_unlock(blp);
 		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
 		counter_u64_add(nummiss, 1);
 		return (0);
 	}
 
 	if (ncp->nc_flag & NCF_NEGATIVE)
 		goto negative_success;
 
 	counter_u64_add(numposhits, 1);
 	*vpp = ncp->nc_vp;
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 	cache_out_ts(ncp, tsp, ticksp);
 	MPASS(dvp != *vpp);
 	/* vget_prep() runs under the bucket lock, vget_finish() after it. */
 	vs = vget_prep(*vpp);
 	mtx_unlock(blp);
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	return (-1);
 negative_success:
 	/*
 	 * We don't get here with regular lookup apart from corner cases.
 	 */
 	if (__predict_true(cnp->cn_nameiop == CREATE)) {
 		if (cnp->cn_flags & ISLASTCN) {
 			counter_u64_add(numnegzaps, 1);
 			/* Releases blp on both success and failure. */
 			error = cache_zap_locked_bucket(ncp, cnp, hash, blp);
 			if (__predict_false(error != 0)) {
 				atomic_add_long(&zap_bucket_fail2, 1);
 				goto retry;
 			}
 			cache_free(ncp);
 			return (0);
 		}
 	}
 
 	whiteout = (ncp->nc_flag & NCF_WHITE);
 	cache_out_ts(ncp, tsp, ticksp);
 	if (cache_neg_hit_prep(ncp))
 		cache_neg_promote(ncp);
 	else
 		cache_neg_hit_finish(ncp);
 	mtx_unlock(blp);
 	if (whiteout)
 		cnp->cn_flags |= ISWHITEOUT;
 	return (ENOENT);
 }
 
 /*
  * Name cache lookup entry point.
  *
  * "." and ".." are dispatched to their dedicated helpers.  If neither
  * MAKEENTRY nor NC_KEEPPOSENTRY is set, the entry is removed and a miss is
  * reported.  Everything else is first attempted without locks inside an smr
  * section; any situation that cannot be resolved that way is handed to
  * cache_lookup_fallback().
  *
  * Returns -1 on a positive hit, ENOENT on a negative hit and 0 on a miss.
  */
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	struct namecache *ncp;
 	uint32_t hash;
 	enum vgetstate vs;
 	int error;
 	bool whiteout, neg_promote;
 	u_short nc_flag;
 
 	MPASS((tsp == NULL && ticksp == NULL) || (tsp != NULL && ticksp != NULL));
 
 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache)) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 #endif
 
 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 		if (cnp->cn_namelen == 1)
 			return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.')
 			return (cache_lookup_dotdot(dvp, vpp, cnp, tsp, ticksp));
 	}
 
 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 
 	if ((cnp->cn_flags & (MAKEENTRY | NC_KEEPPOSENTRY)) == 0) {
 		cache_remove_cnp(dvp, cnp);
 		return (0);
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	vfs_smr_enter();
 
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	if (__predict_false(ncp == NULL)) {
 		vfs_smr_exit();
 		SDT_PROBE2(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr);
 		counter_u64_add(nummiss, 1);
 		return (0);
 	}
 
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if (nc_flag & NCF_NEGATIVE)
 		goto negative_success;
 
 	counter_u64_add(numposhits, 1);
 	*vpp = ncp->nc_vp;
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp);
 	cache_out_ts(ncp, tsp, ticksp);
 	MPASS(dvp != *vpp);
 	/*
 	 * The fields above were read without locks; only trust them if the
 	 * entry is still usable after they were read.
 	 */
 	if (!cache_ncp_canuse(ncp)) {
 		vfs_smr_exit();
 		*vpp = NULL;
 		goto out_fallback;
 	}
 	vs = vget_prep_smr(*vpp);
 	vfs_smr_exit();
 	if (__predict_false(vs == VGET_NONE)) {
 		*vpp = NULL;
 		goto out_fallback;
 	}
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	if (error) {
 		*vpp = NULL;
 		goto out_fallback;
 	}
 	return (-1);
 negative_success:
 	/* Zapping the entry needs locks, so leave that to the fallback. */
 	if (cnp->cn_nameiop == CREATE) {
 		if (cnp->cn_flags & ISLASTCN) {
 			vfs_smr_exit();
 			goto out_fallback;
 		}
 	}
 
 	cache_out_ts(ncp, tsp, ticksp);
 	whiteout = (atomic_load_char(&ncp->nc_flag) & NCF_WHITE);
 	neg_promote = cache_neg_hit_prep(ncp);
 	if (!cache_ncp_canuse(ncp)) {
 		cache_neg_hit_abort(ncp);
 		vfs_smr_exit();
 		goto out_fallback;
 	}
 	if (neg_promote) {
 		/*
 		 * Promotion takes a lock, so the smr section is left first;
 		 * cache_neg_promote_cond() re-validates the entry.
 		 */
 		vfs_smr_exit();
 		if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
 			goto out_fallback;
 	} else {
 		cache_neg_hit_finish(ncp);
 		vfs_smr_exit();
 	}
 	if (whiteout)
 		cnp->cn_flags |= ISWHITEOUT;
 	return (ENOENT);
 out_fallback:
 	return (cache_lookup_fallback(dvp, vpp, cnp, tsp, ticksp));
 }
 
 /*
  * Set of locks held while an entry is being added to the cache: up to three
  * vnode locks and up to two bucket locks.  Unused slots are NULL.
  */
 struct celockstate {
 	struct mtx *vlp[3];
 	struct mtx *blp[2];
 };
 /* The lock/unlock helpers below address the slots by fixed index. */
 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
 
 /*
  * Reset a celockstate to "no locks held" by zeroing every slot.
  */
 static inline void
 cache_celockstate_init(struct celockstate *cel)
 {
 
 	bzero(cel, sizeof(*cel));
 }
 
 /*
  * Lock the vnode locks of vp and dvp in address order and record them in
  * cel->vlp[0] and cel->vlp[1].
  *
  * No vnode lock may be recorded in cel yet.  The two vnodes are treated
  * symmetrically, so the order of the arguments does not matter.  If the lower
  * sorted lock is NULL, vlp[0] stays NULL and only vlp[1] is taken.
  */
 static void
 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
     struct vnode *dvp)
 {
 	struct mtx *vlp1, *vlp2;
 
 	MPASS(cel->vlp[0] == NULL);
 	MPASS(cel->vlp[1] == NULL);
 	MPASS(cel->vlp[2] == NULL);
 
 	MPASS(vp != NULL || dvp != NULL);
 
 	/* NOTE(review): presumably VP2VNODELOCK(NULL) yields NULL - confirm. */
 	vlp1 = VP2VNODELOCK(vp);
 	vlp2 = VP2VNODELOCK(dvp);
 	cache_sort_vnodes(&vlp1, &vlp2);
 
 	if (vlp1 != NULL) {
 		mtx_lock(vlp1);
 		cel->vlp[0] = vlp1;
 	}
 	mtx_lock(vlp2);
 	cel->vlp[1] = vlp2;
 }
 
 static void
 cache_unlock_vnodes_cel(struct celockstate *cel)
 {
 
 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
 
 	if (cel->vlp[0] != NULL)
 		mtx_unlock(cel->vlp[0]);
 	if (cel->vlp[1] != NULL)
 		mtx_unlock(cel->vlp[1]);
 	if (cel->vlp[2] != NULL)
 		mtx_unlock(cel->vlp[2]);
 }
 
 /*
  * Add the vnode lock of vp as the third lock of cel (cel->vlp[2]) while the
  * first two are already held.
  *
  * Returns true if the lock was added without ever releasing the locks
  * already held.  Returns false if they had to be dropped and all three
  * retaken in address order; the caller then has to re-validate whatever it
  * learned under the old locks.  In both cases all three locks are held on
  * return.
  */
 static bool
 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
 {
 	struct mtx *vlp;
 	bool ret;
 
 	cache_assert_vlp_locked(cel->vlp[0]);
 	cache_assert_vlp_locked(cel->vlp[1]);
 	MPASS(cel->vlp[2] == NULL);
 
 	MPASS(vp != NULL);
 	vlp = VP2VNODELOCK(vp);
 
 	ret = true;
 	if (vlp >= cel->vlp[1]) {
 		/* Sorts after everything held: blocking keeps the order. */
 		mtx_lock(vlp);
 	} else {
 		/* Out of order: only a non-blocking attempt is allowed. */
 		if (mtx_trylock(vlp))
 			goto out;
 		/*
 		 * vlp[2] is still NULL here, so this drops exactly the two
 		 * locks held so far.
 		 */
 		cache_unlock_vnodes_cel(cel);
 		atomic_add_long(&cache_lock_vnodes_cel_3_failures, 1);
 		if (vlp < cel->vlp[0]) {
 			mtx_lock(vlp);
 			mtx_lock(cel->vlp[0]);
 			mtx_lock(cel->vlp[1]);
 		} else {
 			if (cel->vlp[0] != NULL)
 				mtx_lock(cel->vlp[0]);
 			mtx_lock(vlp);
 			mtx_lock(cel->vlp[1]);
 		}
 		ret = false;
 	}
 out:
 	cel->vlp[2] = vlp;
 	return (ret);
 }
 
 /*
  * Take up to two bucket locks and record them in cel->blp[0] and
  * cel->blp[1].
  *
  * No bucket lock may be recorded in cel on entry.
  */
 static void
 cache_lock_buckets_cel(struct celockstate *cel, struct mtx *blp1,
     struct mtx *blp2)
 {
 
 	MPASS(cel->blp[0] == NULL);
 	MPASS(cel->blp[1] == NULL);
 
 	/* The same sorting helper as for vnode locks orders the pair. */
 	cache_sort_vnodes(&blp1, &blp2);
 
 	/* After sorting only the first pointer is checked for NULL. */
 	if (blp1 != NULL) {
 		mtx_lock(blp1);
 		cel->blp[0] = blp1;
 	}
 	mtx_lock(blp2);
 	cel->blp[1] = blp2;
 }
 
 /*
  * Release the bucket locks recorded in cel.
  *
  * blp[0] is optional, blp[1] is unlocked unconditionally.
  */
 static void
 cache_unlock_buckets_cel(struct celockstate *cel)
 {
 
 	if (cel->blp[0] != NULL)
 		mtx_unlock(cel->blp[0]);
 	mtx_unlock(cel->blp[1]);
 }
 
 /*
  * Lock part of the cache affected by the insertion.
  *
  * This means vnodelocks for dvp, vp and the relevant bucketlock.
  * However, insertion can result in removal of an old entry. In this
  * case we have an additional vnode and bucketlock pair to lock.
  *
  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
  * preserving the locking order (smaller address first).
  *
  * The old entry in question is the ".." entry hanging off vp->v_cache_dd,
  * which is only looked at when vp is a directory.  All locks taken are
  * recorded in cel and are released with cache_enter_unlock().
  */
 static void
 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
     uint32_t hash)
 {
 	struct namecache *ncp;
 	struct mtx *blps[2];
 	u_char nc_flag;
 
 	/* Bucket lock for the entry which is about to be inserted. */
 	blps[0] = HASH2BUCKETLOCK(hash);
 	for (;;) {
 		blps[1] = NULL;
 		cache_lock_vnodes_cel(cel, dvp, vp);
 		/* Only a directory can have a ".." entry to get rid of. */
 		if (vp == NULL || vp->v_type != VDIR)
 			break;
 		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 		if (ncp == NULL)
 			break;
 		nc_flag = atomic_load_char(&ncp->nc_flag);
 		if ((nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 		MPASS(ncp->nc_dvp == vp);
 		/* The old entry lives in a bucket of its own. */
 		blps[1] = NCP2BUCKETLOCK(ncp);
 		/* A negative entry has no target vnode to lock. */
 		if ((nc_flag & NCF_NEGATIVE) != 0)
 			break;
 		/* true means nothing was dropped while locking the target. */
 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 			break;
 		/*
 		 * All vnodes got re-locked. Re-validate the state and if
 		 * nothing changed we are done. Otherwise restart.
 		 */
 		if (ncp == vp->v_cache_dd &&
 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 			break;
 		/* Start over with a clean set of vnode lock slots. */
 		cache_unlock_vnodes_cel(cel);
 		cel->vlp[0] = NULL;
 		cel->vlp[1] = NULL;
 		cel->vlp[2] = NULL;
 	}
 	/* Bucket locks are taken last, after all vnode locks. */
 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
 }
 
 /*
  * Variant of cache_enter_lock() which looks at the entry hanging off
  * dvp->v_cache_dd instead of vp->v_cache_dd and does not check the vnode
  * type.
  *
  * If that entry is a positive ".." entry, the vnode lock of its target and
  * its bucket lock are taken on top of the locks for dvp, vp and hash.  All
  * locks taken are recorded in cel and are released with
  * cache_enter_unlock().
  */
 static void
 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
     uint32_t hash)
 {
 	struct namecache *ncp;
 	struct mtx *blps[2];
 	u_char nc_flag;
 
 	blps[0] = HASH2BUCKETLOCK(hash);
 	for (;;) {
 		blps[1] = NULL;
 		cache_lock_vnodes_cel(cel, dvp, vp);
 		ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 		if (ncp == NULL)
 			break;
 		nc_flag = atomic_load_char(&ncp->nc_flag);
 		if ((nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 		MPASS(ncp->nc_dvp == dvp);
 		blps[1] = NCP2BUCKETLOCK(ncp);
 		/* A negative entry has no target vnode to lock. */
 		if ((nc_flag & NCF_NEGATIVE) != 0)
 			break;
 		/* true means nothing was dropped while locking the target. */
 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
 			break;
 		/*
 		 * The vnode locks were dropped and taken again.  We are done
 		 * if the entry still looks the same, otherwise restart.
 		 */
 		if (ncp == dvp->v_cache_dd &&
 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
 			break;
 		cache_unlock_vnodes_cel(cel);
 		cel->vlp[0] = NULL;
 		cel->vlp[1] = NULL;
 		cel->vlp[2] = NULL;
 	}
 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
 }
 
 /*
  * Release everything recorded in cel: bucket locks first, vnode locks
  * second.
  */
 static void
 cache_enter_unlock(struct celockstate *cel)
 {
 
 	cache_unlock_buckets_cel(cel);
 	cache_unlock_vnodes_cel(cel);
 }
 
 /*
  * Get dvp ready for having a ".." entry added.
  *
  * If dvp->v_cache_dd is set, the pointer is cleared.  When it points at
  * a ".." entry, that entry is zapped under the locks and freed once they
  * are dropped.
  */
 static void __noinline
 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
     struct componentname *cnp)
 {
 	struct celockstate cel;
 	struct namecache *olddd;
 	uint32_t hash;
 	int namelen;
 
 	/* Unlocked peek: without an entry there is nothing to do. */
 	if (atomic_load_ptr(&dvp->v_cache_dd) == NULL)
 		return;
 
 	namelen = cnp->cn_namelen;
 	cache_celockstate_init(&cel);
 	hash = cache_get_hash(cnp->cn_nameptr, namelen, dvp);
 	cache_enter_lock_dd(&cel, dvp, vp, hash);
 
 	/* Look again now that the locks are held. */
 	olddd = dvp->v_cache_dd;
 	if (olddd == NULL || (olddd->nc_flag & NCF_ISDOTDOT) == 0) {
 		/* Nothing to zap, only the pointer gets cleared below. */
 		olddd = NULL;
 	} else {
 		KASSERT(olddd->nc_dvp == dvp, ("wrong isdotdot parent"));
 		cache_zap_locked(olddd);
 	}
 	atomic_store_ptr(&dvp->v_cache_dd, NULL);
 	cache_enter_unlock(&cel);
 
 	if (olddd != NULL)
 		cache_free(olddd);
 }
 
 /*
  * Add an entry to the cache.
  *
  * dvp is the directory, cnp the name within it and vp the vnode the name
  * resolves to; a NULL vp requests a negative entry.  tsp and dtsp are
  * optional timestamps stored along with the entry (dtsp is only looked at
  * when tsp is provided).
  *
  * A "." name is never entered.  If an entry for the same [dvp, name] pair
  * is already present, the freshly allocated one is discarded.
  */
 void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
 	struct celockstate cel;
 	struct namecache *ncp, *n2, *ndd;
 	struct namecache_ts *ncp_ts;
 	struct nchashhead *ncpp;
 	uint32_t hash;
 	int flag;
 	int len;
 
 	KASSERT(cnp->cn_namelen <= NAME_MAX,
 	    ("%s: passed len %ld exceeds NAME_MAX (%d)", __func__, cnp->cn_namelen,
 	    NAME_MAX));
 	VNPASS(!VN_IS_DOOMED(dvp), dvp);
 	VNPASS(dvp->v_type != VNON, dvp);
 	if (vp != NULL) {
 		VNPASS(!VN_IS_DOOMED(vp), vp);
 		VNPASS(vp->v_type != VNON, vp);
 	}
 	/* dvp and vp are the same vnode exactly for the "." name. */
 	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		KASSERT(dvp == vp,
 		    ("%s: different vnodes for dot entry (%p; %p)\n", __func__,
 		    dvp, vp));
 	} else {
 		KASSERT(dvp != vp,
 		    ("%s: same vnode for non-dot entry [%s] (%p)\n", __func__,
 		    cnp->cn_nameptr, dvp));
 	}
 
 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache))
 		return;
 #endif
 
 	flag = 0;
 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 		/* "." is not cached. */
 		if (cnp->cn_namelen == 1)
 			return;
 		/* "..": get rid of whatever dvp->v_cache_dd points at. */
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			cache_enter_dotdot_prep(dvp, vp, cnp);
 			flag = NCF_ISDOTDOT;
 		}
 	}
 
 	/* No entry is added if the allocation does not succeed. */
 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 	if (ncp == NULL)
 		return;
 
 	cache_celockstate_init(&cel);
 	ndd = NULL;
 	ncp_ts = NULL;
 
 	/*
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
 	ncp->nc_flag = flag | NCF_WIP;
 	ncp->nc_vp = vp;
 	if (vp == NULL)
 		cache_neg_init(ncp);
 	ncp->nc_dvp = dvp;
 	if (tsp != NULL) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		ncp_ts->nc_time = *tsp;
 		ncp_ts->nc_ticks = ticks;
 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
 		if (dtsp != NULL) {
 			ncp_ts->nc_dotdottime = *dtsp;
 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
 		}
 	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 	memcpy(ncp->nc_name, cnp->cn_nameptr, len);
 	ncp->nc_name[len] = '\0';
 	cache_enter_lock(&cel, dvp, vp, hash);
 
 	/*
 	 * See if this vnode or negative entry is already in the cache
 	 * with this name.  This can happen with concurrent lookups of
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
 	CK_SLIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
 			MPASS(cache_ncp_canuse(n2));
 			if ((n2->nc_flag & NCF_NEGATIVE) != 0)
 				KASSERT(vp == NULL,
 				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
 				    __func__, NULL, vp, cnp->cn_nameptr));
 			else
 				KASSERT(n2->nc_vp == vp,
 				    ("%s: found entry pointing to a different vnode (%p != %p) ; name [%s]",
 				    __func__, n2->nc_vp, vp, cnp->cn_nameptr));
 			/*
 			 * Entries are supposed to be immutable unless in the
 			 * process of getting destroyed. Accommodating for
 			 * changing timestamps is possible but not worth it.
 			 * This should be harmless in terms of correctness, in
 			 * the worst case resulting in an earlier expiration.
 			 * Alternatively, the found entry can be replaced
 			 * altogether.
 			 */
 			MPASS((n2->nc_flag & (NCF_TS | NCF_DTS)) == (ncp->nc_flag & (NCF_TS | NCF_DTS)));
 #if 0
 			if (tsp != NULL) {
 				KASSERT((n2->nc_flag & NCF_TS) != 0,
 				    ("no NCF_TS"));
 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
 				n2_ts->nc_time = ncp_ts->nc_time;
 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
 				if (dtsp != NULL) {
 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
 				}
 			}
 #endif
 			SDT_PROBE3(vfs, namecache, enter, duplicate, dvp, ncp->nc_name,
 			    vp);
 			/* Keep the existing entry, drop the new one. */
 			goto out_unlock_free;
 		}
 	}
 
 	if (flag == NCF_ISDOTDOT) {
 		/*
 		 * See if we are trying to add .. entry, but some other lookup
 		 * has populated v_cache_dd pointer already.
 		 */
 		if (dvp->v_cache_dd != NULL)
 			goto out_unlock_free;
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
 		/* Release fence ahead of making the entry reachable. */
 		atomic_thread_fence_rel();
 		atomic_store_ptr(&dvp->v_cache_dd, ncp);
 	}
 
 	if (vp != NULL) {
 		if (flag != NCF_ISDOTDOT) {
 			/*
 			 * For this case, the cache entry maps both the
 			 * directory name in it and the name ".." for the
 			 * directory's parent.
 			 */
 			if ((ndd = vp->v_cache_dd) != NULL) {
 				/* Only a ".." entry is zapped (freed below). */
 				if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
 					cache_zap_locked(ndd);
 				else
 					ndd = NULL;
 			}
 			atomic_thread_fence_rel();
 			atomic_store_ptr(&vp->v_cache_dd, ncp);
 		} else if (vp->v_type != VDIR) {
 			if (vp->v_cache_dd != NULL) {
 				atomic_store_ptr(&vp->v_cache_dd, NULL);
 			}
 		}
 	}
 
 	if (flag != NCF_ISDOTDOT) {
 		/* The first entry on the list makes the cache hold dvp. */
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			cache_hold_vnode(dvp);
 		}
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp != NULL) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
 		    vp);
 	} else {
 		if (cnp->cn_flags & ISWHITEOUT)
 			atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_WHITE);
 		cache_neg_insert(ncp);
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    ncp->nc_name);
 	}
 
 	/*
 	 * Insert the new namecache entry into the appropriate chain
 	 * within the cache entries table.
 	 */
 	CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 
 	atomic_thread_fence_rel();
 	/*
 	 * Mark the entry as fully constructed.
 	 * It is immutable past this point until its removal.
 	 */
 	atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP);
 
 	cache_enter_unlock(&cel);
 	/* The zapped ".." entry is freed only after the locks are gone. */
 	if (ndd != NULL)
 		cache_free(ndd);
 	return;
 out_unlock_free:
 	/* The new entry was not inserted, release it. */
 	cache_enter_unlock(&cel);
 	cache_free(ncp);
 	return;
 }
 
 /*
  * Same as cache_enter_time(), except the caller can pass flags:
  *
  * - VFS_CACHE_DROPOLD -- if a conflicting entry is found, drop it.
  *
  * TODO: this routine is a hack. The old entry is removed blindly, even when
  * it happens to match, and the removal is done in an inefficient manner. It
  * was added to accommodate NFS, which runs into a case where the target for
  * a given name may change from under it. Note this does nothing to solve the
  * following race: 2 callers of cache_enter_time_flags pass a different
  * target vnode for the same [dvp, cnp]. It may be argued that code doing
  * this is broken.
  */
 void
 cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp, int flags)
 {
 
 	MPASS((flags & ~(VFS_CACHE_DROPOLD)) == 0);
 
 	/* Remove first, enter second; the two steps are not atomic. */
 	if ((flags & VFS_CACHE_DROPOLD) != 0)
 		(void)cache_remove_cnp(dvp, cnp);
 	cache_enter_time(dvp, vp, cnp, tsp, dtsp);
 }
 
 /*
  * Return the smallest power of 2 which is strictly greater than val.
  */
 static u_long
 cache_roundup_2(u_long val)
 {
 	u_long pow2;
 
 	pow2 = 1;
 	while (pow2 <= val)
 		pow2 <<= 1;
 
 	return (pow2);
 }
 
 /*
  * Allocate a hash table and initialize all of its chains.
  *
  * The table gets cache_roundup_2(elements) / 2 buckets.  The matching mask
  * (bucket count minus one) is stored in *hashmask.
  */
 static struct nchashhead *
 nchinittbl(u_long elements, u_long *hashmask)
 {
 	struct nchashhead *tbl, *head;
 	u_long nbuckets;
 
 	nbuckets = cache_roundup_2(elements) / 2;
 
 	tbl = malloc(nbuckets * sizeof(*tbl), M_VFSCACHE, M_WAITOK);
 	for (head = tbl; head < tbl + nbuckets; head++)
 		CK_SLIST_INIT(head);
 	*hashmask = nbuckets - 1;
 	return (tbl);
 }
 
 /*
  * Free a hash table obtained from nchinittbl().
  *
  * Only the table memory is released; the chains are not looked at.
  */
 static void
 ncfreetbl(struct nchashhead *hashtbl)
 {
 
 	free(hashtbl, M_VFSCACHE);
 }
 
 /*
  * Name cache initialization, from vfs_init() when we are booting
  *
  * Creates the UMA zones for namecache entries, sizes and allocates the hash
  * table, and initializes the bucket locks, the vnode locks and the negative
  * entry lists.
  */
 static void
 nchinit(void *dummy __unused)
 {
 	u_int i;
 
 	/* Four zones: small/large names, each with and without timestamps. */
 	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE,
 	    NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT);
 
 	VFS_SMR_ZONE_SET(cache_zone_small);
 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
 	VFS_SMR_ZONE_SET(cache_zone_large);
 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
 
 	ncsize = desiredvnodes * ncsizefactor;
 	cache_recalc_neg_min();
 	/*
 	 * NOTE(review): the table is sized from ncsize here, that is
 	 * desiredvnodes * ncsizefactor, while cache_changesize() sizes it
 	 * from newmaxvnodes * 2 regardless of ncsizefactor -- confirm the
 	 * difference is intended.
 	 */
 	nchashtbl = nchinittbl(ncsize, &nchash);
 	/* Mask for the lock arrays, clamped to the range [7, nchash]. */
 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
 		ncbuckethash = 7;
 	if (ncbuckethash > nchash)
 		ncbuckethash = nchash;
 	/* presumably numbucketlocks derives from ncbuckethash -- TODO confirm */
 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numbucketlocks; i++)
 		mtx_init(&bucketlocks[i], "ncbuc", NULL, MTX_DUPOK | MTX_RECURSE);
 	/* The vnode lock mask is the same as the bucket lock mask. */
 	ncvnodehash = ncbuckethash;
 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numvnodelocks; i++)
 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 
 	/* Each negative list has two locks and two queues. */
 	for (i = 0; i < numneglists; i++) {
 		mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
 		TAILQ_INIT(&neglists[i].nl_list);
 		TAILQ_INIT(&neglists[i].nl_hotlist);
 	}
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
 /*
  * Set up the namecache related fields of a vnode: no ".." entry, empty
  * source and destination lists, and whatever cache_prehash() sets up.
  */
 void
 cache_vnode_init(struct vnode *vp)
 {
 
 	vp->v_cache_dd = NULL;
 	LIST_INIT(&vp->v_cache_src);
 	TAILQ_INIT(&vp->v_cache_dst);
 	cache_prehash(vp);
 }
 
 /*
  * Induce transient cache misses for lockless operation in cache_lookup() by
  * using a temporary hash table.
  *
  * This will force a fs lookup.
  *
  * Synchronisation is done in 2 steps, calling vfs_smr_synchronize each time
  * to observe all CPUs not performing the lookup.
  *
  * temptbl is the temporary table and temphash its mask, which has to be
  * smaller than the current nchash.  The mask is switched first, the table
  * pointer second.
  */
 static void
 cache_changesize_set_temp(struct nchashhead *temptbl, u_long temphash)
 {
 
 	MPASS(temphash < nchash);
 	/*
 	 * Change the size. The new size is smaller and can safely be used
 	 * against the existing table. All lookups which now hash wrong will
 	 * result in a cache miss, which all callers are supposed to know how
 	 * to handle.
 	 */
 	atomic_store_long(&nchash, temphash);
 	atomic_thread_fence_rel();
 	vfs_smr_synchronize();
 	/*
 	 * At this point everyone sees the updated hash value, but they still
 	 * see the old table.
 	 */
 	atomic_store_ptr(&nchashtbl, temptbl);
 	atomic_thread_fence_rel();
 	vfs_smr_synchronize();
 	/*
 	 * At this point everyone sees the updated table pointer and size pair.
 	 */
 }
 
 /*
  * Set the new hash table.
  *
  * Similarly to cache_changesize_set_temp(), this has to synchronize against
  * lockless operation in cache_lookup().
  *
  * new_tbl is the table to switch to and new_hash its mask, which has to be
  * bigger than the current nchash.  The order is the reverse of
  * cache_changesize_set_temp(): the table pointer is switched first, the
  * mask second.
  */
 static void
 cache_changesize_set_new(struct nchashhead *new_tbl, u_long new_hash)
 {
 
 	MPASS(nchash < new_hash);
 	/*
 	 * Change the pointer first. This won't result in out of bounds access
 	 * since the temporary table is guaranteed to be smaller.
 	 */
 	atomic_store_ptr(&nchashtbl, new_tbl);
 	atomic_thread_fence_rel();
 	vfs_smr_synchronize();
 	/*
 	 * At this point everyone sees the updated pointer value, but they
 	 * still see the old size.
 	 */
 	atomic_store_long(&nchash, new_hash);
 	atomic_thread_fence_rel();
 	vfs_smr_synchronize();
 	/*
 	 * At this point everyone sees the updated table pointer and size pair.
 	 */
 }
 
 /*
  * Resize the namecache for a new maximum number of vnodes.
  *
  * A new hash table is allocated and, unless it turns out to have the same
  * size as the current one, all entries are moved over to it while every
  * vnode lock and bucket lock is held.  ncsize is updated as well.
  */
 void
 cache_changesize(u_long newmaxvnodes)
 {
 	struct nchashhead *new_nchashtbl, *old_nchashtbl, *temptbl;
 	u_long new_nchash, old_nchash, temphash;
 	struct namecache *ncp;
 	uint32_t hash;
 	u_long newncsize;
 	u_long i;
 
 	/* Compute the entry limit before newmaxvnodes gets rounded up. */
 	newncsize = newmaxvnodes * ncsizefactor;
 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
 	if (newmaxvnodes < numbucketlocks)
 		newmaxvnodes = numbucketlocks;
 
 	new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash);
 	/* If same hash table size, nothing to do */
 	if (nchash == new_nchash) {
 		ncfreetbl(new_nchashtbl);
 		return;
 	}
 
 	/* Table used by lookups while the entries are being moved. */
 	temptbl = nchinittbl(1, &temphash);
 
 	/*
 	 * Move everything from the old hash table to the new table.
 	 * None of the namecache entries in the table can be removed
 	 * because to do so, they have to be removed from the hash table.
 	 */
 	cache_lock_all_vnodes();
 	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	cache_changesize_set_temp(temptbl, temphash);
 	/* old_nchash is a mask, hence the inclusive upper bound. */
 	for (i = 0; i <= old_nchash; i++) {
 		while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			/* Recompute the hash to find the chain in the new table. */
 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 			    ncp->nc_dvp);
 			CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash);
 			CK_SLIST_INSERT_HEAD(&new_nchashtbl[hash & new_nchash], ncp, nc_hash);
 		}
 	}
 	ncsize = newncsize;
 	cache_recalc_neg_min();
 	cache_changesize_set_new(new_nchashtbl, new_nchash);
 	cache_unlock_all_buckets();
 	cache_unlock_all_vnodes();
 	/* Neither table is published any more at this point. */
 	ncfreetbl(old_nchashtbl);
 	ncfreetbl(temptbl);
 }
 
 /*
  * Remove all entries from and to a particular vnode.
  *
  * This covers entries where vp is the directory (v_cache_src), entries which
  * resolve to vp (v_cache_dst) and the ".." entry hanging off v_cache_dd.
  * Zapped entries are collected on a local batch and freed once the locks
  * are dropped.
  */
 static void
 cache_purge_impl(struct vnode *vp)
 {
 	struct cache_freebatch batch;
 	struct namecache *ncp;
 	struct mtx *vlp, *vlp2;
 
 	TAILQ_INIT(&batch);
 	vlp = VP2VNODELOCK(vp);
 	/* Second vnode lock, managed by cache_zap_locked_vnode_kl2(). */
 	vlp2 = NULL;
 	mtx_lock(vlp);
 retry:
 	/*
 	 * Whenever zapping reports failure the scan starts over from the
 	 * top, with the entry in question not added to the batch.
 	 */
 	while (!LIST_EMPTY(&vp->v_cache_src)) {
 		ncp = LIST_FIRST(&vp->v_cache_src);
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 	}
 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 	}
 	/* With both lists empty, anything left here has to be "..". */
 	ncp = vp->v_cache_dd;
 	if (ncp != NULL) {
 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 		   ("lost dotdot link"));
 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
 			goto retry;
 		TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 	}
 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
 	mtx_unlock(vlp);
 	if (vlp2 != NULL)
 		mtx_unlock(vlp2);
 	cache_free_batch(&batch);
 }
 
 /*
  * Opportunistic check to see if there is anything to do.
  *
  * No lock is taken here.  The answer is true if either of the two lists of
  * the vnode is not empty or if v_cache_dd is set.
  */
 static bool
 cache_has_entries(struct vnode *vp)
 {
 
 	return (!LIST_EMPTY(&vp->v_cache_src) ||
 	    !TAILQ_EMPTY(&vp->v_cache_dst) ||
 	    atomic_load_ptr(&vp->v_cache_dd) != NULL);
 }
 
 /*
  * Remove all entries from and to a particular vnode, provided the unlocked
  * check finds anything to remove.  The probe fires regardless.
  */
 void
 cache_purge(struct vnode *vp)
 {
 
 	SDT_PROBE1(vfs, namecache, purge, done, vp);
 	if (cache_has_entries(vp))
 		cache_purge_impl(vp);
 }
 
 /*
  * Only to be used by vgone.
  *
  * Works like cache_purge(), except that when the first check finds no
  * entries the check is repeated after waiting for the vnode lock to be
  * released.
  */
 void
 cache_purge_vgone(struct vnode *vp)
 {
 	struct mtx *vlp;
 
 	VNPASS(VN_IS_DOOMED(vp), vp);
 	if (!cache_has_entries(vp)) {
 		/*
 		 * Serialize against a potential thread doing cache_purge.
 		 */
 		vlp = VP2VNODELOCK(vp);
 		mtx_wait_unlocked(vlp);
 		if (!cache_has_entries(vp))
 			return;
 	}
 	cache_purge_impl(vp);
 }
 
 /*
  * Remove all negative entries for a particular directory vnode.
  *
  * Entries are zapped with the vnode lock held and freed as a batch after
  * it is dropped.
  */
 void
 cache_purge_negative(struct vnode *vp)
 {
 	struct cache_freebatch batch;
 	struct namecache *ncp, *nnp;
 	struct mtx *vlp;
 
 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 	/* Unlocked check, an empty list means there is nothing to do. */
 	if (LIST_EMPTY(&vp->v_cache_src))
 		return;
 
 	TAILQ_INIT(&batch);
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	/* Entries get zapped while iterating, hence the _SAFE variant. */
 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 		if ((ncp->nc_flag & NCF_NEGATIVE) != 0) {
 			cache_zap_negative_locked_vnode_kl(ncp, vp);
 			TAILQ_INSERT_TAIL(&batch, ncp, nc_dst);
 		}
 	}
 	mtx_unlock(vlp);
 	cache_free_batch(&batch);
 }
 
 /*
  * Entry points for modifying VOP operations.
  */
 
 /*
  * Update the namecache for a rename of fvp to the name tcnp in tdvp.
  *
  * Everything known about fvp is purged, and so is everything known about
  * tvp if there is one.  Without a tvp, any entry for the target name is
  * removed.  Finally the target name is entered as resolving to fvp.
  */
 void
 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
 {
 
 	ASSERT_VOP_IN_SEQC(fdvp);
 	ASSERT_VOP_IN_SEQC(fvp);
 	ASSERT_VOP_IN_SEQC(tdvp);
 	if (tvp != NULL)
 		ASSERT_VOP_IN_SEQC(tvp);
 
 	cache_purge(fvp);
 	if (tvp == NULL) {
 		cache_remove_cnp(tdvp, tcnp);
 	} else {
 		cache_purge(tvp);
 		/*
 		 * The removal below sits inside the assertion, so it is only
 		 * attempted when KASSERT is compiled in.
 		 */
 		KASSERT(!cache_remove_cnp(tdvp, tcnp),
 		    ("%s: lingering negative entry", __func__));
 	}
 
 	/*
 	 * TODO
 	 *
 	 * Historically renaming was always purging all relevant entries,
 	 * but that's quite wasteful. In particular it turns out that in many
 	 * cases the target file is immediately accessed after rename,
 	 * inducing a cache miss.
 	 *
 	 * Recode this to reduce relocking and reuse the existing entry (if any)
 	 * instead of just removing it above and allocating a new one here.
 	 */
 	cache_enter(tdvp, fvp, tcnp);
 }
 
 /*
  * Update the namecache for removal of the directory vp from dvp: all entries
  * from and to vp are purged.  dvp is only used for the assertion.
  */
 void
 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
 {
 
 	ASSERT_VOP_IN_SEQC(dvp);
 	ASSERT_VOP_IN_SEQC(vp);
 	cache_purge(vp);
 }
 
 #ifdef INVARIANTS
 /*
  * Validate that if an entry exists it matches.
  *
  * The chain for [dvp, cnp] is walked with its bucket lock held.  An entry
  * with this directory and name which resolves to anything other than vp
  * results in a panic.
  */
 void
 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct namecache *ncp;
 	struct mtx *blp;
 	uint32_t hash;
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	/* Unlocked check, an empty chain cannot hold a mismatch. */
 	if (CK_SLIST_EMPTY(NCHHASH(hash)))
 		return;
 
 	blp = HASH2BUCKETLOCK(hash);
 	mtx_lock(blp);
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		/* Skip entries for other directories or other names. */
 		if (ncp->nc_dvp != dvp || ncp->nc_nlen != cnp->cn_namelen ||
 		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) != 0)
 			continue;
 		if (ncp->nc_vp != vp)
 			panic("%s: mismatch (%p != %p); ncp %p [%s] dvp %p\n",
 			    __func__, vp, ncp->nc_vp, ncp, ncp->nc_name, ncp->nc_dvp);
 	}
 	mtx_unlock(blp);
 }
 
 /*
  * Assert that the vnode has no namecache entries: both lists are empty and
  * v_cache_dd is not set.
  */
 void
 cache_assert_no_entries(struct vnode *vp)
 {
 
 	VNPASS(TAILQ_EMPTY(&vp->v_cache_dst), vp);
 	VNPASS(LIST_EMPTY(&vp->v_cache_src), vp);
 	VNPASS(vp->v_cache_dd == NULL, vp);
 }
 #endif
 
 /*
  * Flush all entries referencing a particular filesystem.
  *
  * Every vnode of the mount point is visited and those which appear to have
  * entries are purged.  The counters are only consumed by the probe.
  */
 void
 cache_purgevfs(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 	size_t visited __sdt_used, purged __sdt_used;
 
 	visited = purged = 0;
 	/*
 	 * Somewhat wasteful iteration over all vnodes. Would be better to
 	 * support filtering and avoid the interlock to begin with.
 	 */
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		visited++;
 		if (cache_has_entries(vp)) {
 			/* Hold the vnode across the purge. */
 			vholdl(vp);
 			VI_UNLOCK(vp);
 			cache_purge(vp);
 			purged++;
 			vdrop(vp);
 		} else {
 			VI_UNLOCK(vp);
 		}
 	}
 
 	SDT_PROBE3(vfs, namecache, purgevfs, done, mp, visited, purged);
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  *
  * The checks are, in order: the directory vnode really is a directory
  * (ENOTDIR), the last component is not being deleted or renamed on
  * a read-only mount (EROFS), and whatever vn_dir_check_exec() verifies.
  */
 
 int
 vfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	int error, flags;
 
 	dvp = ap->a_dvp;
 	vpp = ap->a_vpp;
 	cnp = ap->a_cnp;
 	flags = cnp->cn_flags;
 
 	*vpp = NULL;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) != 0 &&
 	    (dvp->v_mount->mnt_flag & MNT_RDONLY) != 0 &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = vn_dir_check_exec(dvp, cnp);
 	if (error != 0)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	switch (error) {
 	case 0:
 		/* The cache had no answer, ask the filesystem. */
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	case -1:
 		/* Reported to the caller as success. */
 		return (0);
 	default:
 		return (error);
 	}
 }
 
 /* Implementation of the getcwd syscall. */
 int
 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 {
 	char *buf, *retbuf;
 	size_t buflen;
 	int error;
 
 	buflen = uap->buflen;
 	if (__predict_false(buflen < 2))
 		return (EINVAL);
 	if (buflen > MAXPATHLEN)
 		buflen = MAXPATHLEN;
 
 	buf = uma_zalloc(namei_zone, M_WAITOK);
 	error = vn_getcwd(buf, &retbuf, &buflen);
 	if (error == 0)
 		error = copyout(retbuf, uap->buf, buflen);
 	uma_zfree(namei_zone, buf);
 	return (error);
 }
 
 /*
  * Resolve the current working directory of the calling thread to a path.
  *
  * buf is the work buffer and *buflen its size.  On success *retbuf is set
  * by the resolution routines and *buflen is updated by them.
  *
  * A lockless resolution is tried first.  If it returns a negative value,
  * the resolution is repeated with a held pwd.
  */
 int
 vn_getcwd(char *buf, char **retbuf, size_t *buflen)
 {
 	struct pwd *pwd;
 	int error;
 
 	vfs_smr_enter();
 	pwd = pwd_get_smr();
 	error = vn_fullpath_any_smr(pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf,
 	    buflen, 0);
 	/* The SMR section is left by the callee no matter the result. */
 	VFS_SMR_ASSERT_NOT_ENTERED();
 	if (error < 0) {
 		pwd = pwd_hold(curthread);
 		error = vn_fullpath_any(pwd->pwd_cdir, pwd->pwd_rdir, buf,
 		    retbuf, buflen);
 		pwd_drop(pwd);
 	}
 
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
 		ktrnamei(*retbuf);
 #endif
 	return (error);
 }
 
 /*
  * Canonicalize a path by walking it forward and back.
  *
  * The path is looked up relative to fd and the result is stored in buf,
  * which can hold size bytes.  pathseg tells whether path and buf are
  * userspace addresses; it decides between copyout() and memcpy() for the
  * result.  flags has to be 0, anything else fails with EINVAL.  A result
  * which does not fit in size bytes, including the terminating NUL, fails
  * with ENAMETOOLONG.
  *
  * BUGS:
  * - Nothing guarantees the integrity of the entire chain. Consider the case
  *   where the path "foo/bar/baz/qux" is passed, but "bar" is moved out of
  *   "foo" into "quux" during the backwards walk. The result will be
  *   "quux/bar/baz/qux", which could not have been obtained by an incremental
  *   walk in userspace. Moreover, the path we return is inaccessible if the
  *   calling thread lacks permission to traverse "quux".
  */
 static int
 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
     size_t size, int flags, enum uio_seg pathseg)
 {
 	struct nameidata nd;
 	char *retbuf, *freebuf;
 	int error;
 
 	if (flags != 0)
 		return (EINVAL);
 	/* Forward walk.  WANTPARENT also yields the parent in ni_dvp. */
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | WANTPARENT | AUDITVNODE1,
 	    pathseg, path, fd, &cap_fstat_rights);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 
 	if (nd.ni_vp->v_type == VREG && nd.ni_dvp->v_type != VDIR &&
 	    (nd.ni_vp->v_vflag & VV_ROOT) != 0) {
 		struct vnode *covered_vp;
 
 		/*
 		 * This happens if vp is a file mount. The call to
 		 * vn_fullpath_hardlink can panic if path resolution can't be
 		 * handled without the directory.
 		 *
 		 * To resolve this, we find the vnode which was mounted on -
 		 * this should have a unique global path since we disallow
 		 * mounting on linked files.
 		 */
 		error = vn_lock(nd.ni_vp, LK_SHARED);
 		if (error != 0)
 			goto out;
 		covered_vp = nd.ni_vp->v_mount->mnt_vnodecovered;
 		/* Reference the covered vnode before letting go of the lock. */
 		vref(covered_vp);
 		VOP_UNLOCK(nd.ni_vp);
 		error = vn_fullpath(covered_vp, &retbuf, &freebuf);
 		vrele(covered_vp);
 	} else {
 		error = vn_fullpath_hardlink(nd.ni_vp, nd.ni_dvp,
 		    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen, &retbuf,
 		    &freebuf, &size);
 	}
 	if (error == 0) {
 		size_t len;
 
 		/* The length includes the terminating NUL. */
 		len = strlen(retbuf) + 1;
 		if (size < len)
 			error = ENAMETOOLONG;
 		else if (pathseg == UIO_USERSPACE)
 			error = copyout(retbuf, buf, len);
 		else
 			memcpy(buf, retbuf, len);
 		/* freebuf is only handed to us when resolving succeeded. */
 		free(freebuf, M_TEMP);
 	}
 out:
 	/* Drop what namei() returned. */
 	vrele(nd.ni_vp);
 	vrele(nd.ni_dvp);
 	NDFREE_PNBUF(&nd);
 	return (error);
 }
 
 /*
  * Syscall entry point for __realpathat: passes the arguments on to
  * kern___realpathat(), with both the path and the buffer being userspace
  * addresses.
  */
 int
 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
 {
 
 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
 	    uap->flags, UIO_USERSPACE));
 }
 
 /*
  * Retrieve the full filesystem path that corresponds to a vnode from the
  * name cache (if available).
  *
  * The path is resolved against the root directory of the calling thread.
  * On success *retbuf points at the path and *freebuf at the M_TEMP buffer
  * which contains it; the caller frees the latter.  On failure neither of
  * the two is to be used and nothing is left allocated.
  */
 int
 vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf)
 {
 	struct pwd *pwd;
 	char *buf;
 	size_t buflen;
 	int error;
 
 	if (__predict_false(vp == NULL))
 		return (EINVAL);
 
 	buflen = MAXPATHLEN;
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 
 	/* Lockless attempt first. */
 	vfs_smr_enter();
 	pwd = pwd_get_smr();
 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, &buflen, 0);
 	VFS_SMR_ASSERT_NOT_ENTERED();
 	if (error < 0) {
 		/* Negative result: try again with a held pwd. */
 		pwd = pwd_hold(curthread);
 		error = vn_fullpath_any(vp, pwd->pwd_rdir, buf, retbuf, &buflen);
 		pwd_drop(pwd);
 	}
 
 	if (error != 0) {
 		free(buf, M_TEMP);
 		return (error);
 	}
 	*freebuf = buf;
 	return (0);
 }
 
 /*
  * This function is similar to vn_fullpath, but it attempts to lookup the
  * pathname relative to the global root mount point.  This is required for the
  * auditing sub-system, as audited pathnames must be absolute, relative to the
  * global root mount point.
  *
  * Buffer ownership is the same as for vn_fullpath: on success the caller
  * frees *freebuf, on failure nothing is left allocated.
  */
 int
 vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf)
 {
 	char *buf;
 	size_t buflen;
 	int error;
 
 	if (__predict_false(vp == NULL))
 		return (EINVAL);
 
 	buflen = MAXPATHLEN;
 	buf = malloc(buflen, M_TEMP, M_WAITOK);
 
 	/* Lockless attempt first, anything negative means try again. */
 	vfs_smr_enter();
 	error = vn_fullpath_any_smr(vp, rootvnode, buf, retbuf, &buflen, 0);
 	VFS_SMR_ASSERT_NOT_ENTERED();
 	if (error < 0)
 		error = vn_fullpath_any(vp, rootvnode, buf, retbuf, &buflen);
 
 	if (error != 0) {
 		free(buf, M_TEMP);
 		return (error);
 	}
 	*freebuf = buf;
 	return (0);
 }
 
 /*
  * Find the first entry on the destination list of the vnode which is not
  * a ".." entry.  Returns NULL if there is none.
  *
  * The vnode lock for vp has to be held.
  */
 static struct namecache *
 vn_dd_from_dst(struct vnode *vp)
 {
 	struct namecache *ncp;
 
 	cache_assert_vnode_locked(vp);
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		break;
 	}
 	/* ncp is NULL here if the loop ran off the end of the list. */
 	return (ncp);
 }
 
 /*
  * Move one step up the tree: find a name for *vp along with its parent.
  *
  * buf is filled from the back.  *buflen is the offset the already stored
  * part of the path begins at; on the namecache path the name is copied in
  * right in front of it and *buflen is reduced by the length of the name.
  * Without a usable namecache entry the work is passed on to VOP_VPTOCNP()
  * with the same buf and buflen.
  *
  * *vp has to be referenced on entry.  On success it is replaced with the
  * parent, which is returned referenced, and the reference on the original
  * vnode is dropped.  On failure the reference is dropped as well.
  *
  * Returns 0 on success, ENOMEM if the name does not fit, ENOENT if the
  * parent turned out to be doomed, or the error from VOP_VPTOCNP().
  */
 int
 vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen)
 {
 	struct vnode *dvp;
 	struct namecache *ncp;
 	struct mtx *vlp;
 	int error;
 
 	vlp = VP2VNODELOCK(*vp);
 	mtx_lock(vlp);
 	/*
 	 * Use v_cache_dd unless it is unset or a ".." entry, in which case
 	 * the destination list is searched for a regular entry.
 	 */
 	ncp = (*vp)->v_cache_dd;
 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT) == 0) {
 		KASSERT(ncp == vn_dd_from_dst(*vp),
 		    ("%s: mismatch for dd entry (%p != %p)", __func__,
 		    ncp, vn_dd_from_dst(*vp)));
 	} else {
 		ncp = vn_dd_from_dst(*vp);
 	}
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
 			mtx_unlock(vlp);
 			vrele(*vp);
 			counter_u64_add(numfullpathfail4, 1);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    vp, NULL);
 			return (error);
 		}
 		/* The name goes in without a terminating NUL. */
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    ncp->nc_name, vp);
 		/* Reference the parent before the lock is dropped. */
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
 		mtx_unlock(vlp);
 		vrele(dvp);
 		return (0);
 	}
 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 
 	/* Nothing in the namecache, ask the filesystem. */
 	mtx_unlock(vlp);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, buf, buflen);
 	/* Unlocks the vnode and drops the reference passed in. */
 	vput(*vp);
 	if (error) {
 		counter_u64_add(numfullpathfail2, 1);
 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 		return (error);
 	}
 
 	*vp = dvp;
 	if (VN_IS_DOOMED(dvp)) {
 		/* forced unmount */
 		vrele(dvp);
 		error = ENOENT;
 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 		return (error);
 	}
 	/*
 	 * *vp has its use count incremented still.
 	 */
 
 	return (0);
 }
 
 /*
  * Resolve a directory to a pathname.
  *
  * The name of the directory can always be found in the namecache or fetched
  * from the filesystem. There is also guaranteed to be only one parent, meaning
  * we can just follow vnodes up until we find the root.
  *
  * The vnode must be referenced.
  *
  * The path is put together backwards, starting at the end of buf, which is
  * *len bytes long.  The walk stops at rdir or at rootvnode.  With an addend
  * of 0 the path gets NUL-terminated here; a different value is taken to
  * mean the caller has the tail of the path in place already (presumably
  * that many bytes of it -- TODO confirm against the callers), in which case
  * no NUL is stored.
  *
  * On success *retbuf points at the beginning of the path and *len is set
  * to the number of bytes used at the end of buf plus addend.  The reference
  * on vp is consumed regardless of the outcome.
  */
 static int
 vn_fullpath_dir(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
     size_t *len, size_t addend)
 {
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *vp1;
 	size_t buflen;
 	int error;
 	bool slash_prefixed;
 
 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
 	VNPASS(vp->v_usecount > 0, vp);
 
 	buflen = *len;
 
 	/*
 	 * slash_prefixed tells whether a '/' still has to be put in front
 	 * once the walk is over, which is the case if no component gets
 	 * added on top of an empty tail.
 	 */
 	slash_prefixed = true;
 	if (addend == 0) {
 		MPASS(*len >= 2);
 		buflen--;
 		buf[buflen] = '\0';
 		slash_prefixed = false;
 	}
 
 	error = 0;
 
 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 	counter_u64_add(numfullpathcalls, 1);
 	while (vp != rdir && vp != rootvnode) {
 		/*
 		 * The vp vnode must be already fully constructed,
 		 * since it is either found in namecache or obtained
 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
 		 * without obtaining the vnode lock.
 		 */
 		if ((vp->v_vflag & VV_ROOT) != 0) {
 			vn_lock(vp, LK_RETRY | LK_SHARED);
 
 			/*
 			 * With the vnode locked, check for races with
 			 * unmount, forced or not.  Note that we
 			 * already verified that vp is not equal to
 			 * the root vnode, which means that
 			 * mnt_vnodecovered can be NULL only for the
 			 * case of unmount.
 			 */
 			if (VN_IS_DOOMED(vp) ||
 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
 			    vp1->v_mountedhere != vp->v_mount) {
 				vput(vp);
 				error = ENOENT;
 				SDT_PROBE3(vfs, namecache, fullpath, return,
 				    error, vp, NULL);
 				break;
 			}
 
 			/* Continue from the vnode the filesystem is mounted on. */
 			vref(vp1);
 			vput(vp);
 			vp = vp1;
 			continue;
 		}
 		VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
 		/* On failure the reference on vp is gone already. */
 		error = vn_vptocnp(&vp, buf, &buflen);
 		if (error)
 			break;
 		/* There has to be room left for the separator. */
 		if (buflen == 0) {
 			vrele(vp);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    startvp, NULL);
 			break;
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = true;
 	}
 	if (error)
 		return (error);
 	if (!slash_prefixed) {
 		/* No component was added, the result is just "/". */
 		if (buflen == 0) {
 			vrele(vp);
 			counter_u64_add(numfullpathfail4, 1);
 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 	}
 	counter_u64_add(numfullpathfound, 1);
 	vrele(vp);
 
 	*retbuf = buf + buflen;
 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
 	/* buflen is the offset of the path start within buf. */
 	*len -= buflen;
 	*len += addend;
 	return (0);
 }
 
 /*
  * Resolve an arbitrary vnode to a pathname.
  *
  * Note 2 caveats:
  * - hardlinks are not tracked, thus if the vnode is not a directory this can
  *   resolve to a different path than the one used to find it
  * - namecache is not mandatory, meaning names are not guaranteed to be added
  *   (in which case resolving fails)
  */
 /*
  * Record the source line at which a lockless reverse lookup gave up, so that
  * the failure site can be reported by the caller's tracing probe.
  */
 static void __inline
 cache_rev_failed_impl(int *reason, int line)
 {
 	reason[0] = line;
 }
 #define cache_rev_failed(var)	cache_rev_failed_impl((var), __LINE__)
 
 /*
  * Lockless attempt at resolving vp to a path relative to rdir.
  *
  * Must be called inside a vfs SMR section; the section is always exited
  * before returning.  Names are taken from the v_cache_dd entry of each
  * vnode on the way up and every step is validated with sequence counters.
  *
  * Returns 0 on success (with *retbuf pointing into buf and *buflen set to the
  * number of bytes used, including addend), ENOMEM if a name does not fit, or
  * -1 when the lockless walk cannot be trusted.  On failure *buflen is reset
  * to its value on entry.
  */
 static int
 vn_fullpath_any_smr(struct vnode *vp, struct vnode *rdir, char *buf,
     char **retbuf, size_t *buflen, size_t addend)
 {
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *tvp;
 	struct mount *mp;
 	struct namecache *ncp;
 	size_t orig_buflen;
 	int reason;
 	int error;
 #ifdef KDTRACE_HOOKS
 	/* Number of loop iterations, reported by the miss probe. */
 	int i;
 #endif
 	seqc_t vp_seqc, tvp_seqc;
 	u_char nc_flag;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
 		vfs_smr_exit();
 		return (-1);
 	}
 
 	orig_buflen = *buflen;
 
 	/* With no caller-supplied tail, terminate the string ourselves. */
 	if (addend == 0) {
 		MPASS(*buflen >= 2);
 		*buflen -= 1;
 		buf[*buflen] = '\0';
 	}
 
 	/* Already at the root: the path is "/" (or just the caller's tail). */
 	if (vp == rdir || vp == rootvnode) {
 		if (addend == 0) {
 			*buflen -= 1;
 			buf[*buflen] = '/';
 		}
 		goto out_ok;
 	}
 
 #ifdef KDTRACE_HOOKS
 	i = 0;
 #endif
 	/* Default failure code: tells the caller to retry with locks. */
 	error = -1;
 	ncp = NULL; /* for sdt probe down below */
 	vp_seqc = vn_seqc_read_any(vp);
 	if (seqc_in_modify(vp_seqc)) {
 		cache_rev_failed(&reason);
 		goto out_abort;
 	}
 
 	for (;;) {
 #ifdef KDTRACE_HOOKS
 		i++;
 #endif
 		/*
 		 * Root of a mounted filesystem: continue from the vnode it
 		 * covers without emitting a name.
 		 */
 		if ((vp->v_vflag & VV_ROOT) != 0) {
 			mp = atomic_load_ptr(&vp->v_mount);
 			if (mp == NULL) {
 				cache_rev_failed(&reason);
 				goto out_abort;
 			}
 			tvp = atomic_load_ptr(&mp->mnt_vnodecovered);
 			tvp_seqc = vn_seqc_read_any(tvp);
 			if (seqc_in_modify(tvp_seqc)) {
 				cache_rev_failed(&reason);
 				goto out_abort;
 			}
 			/* Make sure vp did not change while we looked at it. */
 			if (!vn_seqc_consistent(vp, vp_seqc)) {
 				cache_rev_failed(&reason);
 				goto out_abort;
 			}
 			vp = tvp;
 			vp_seqc = tvp_seqc;
 			continue;
 		}
 		ncp = atomic_load_consume_ptr(&vp->v_cache_dd);
 		if (ncp == NULL) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		/* A ".." entry does not carry the name of vp. */
 		nc_flag = atomic_load_char(&ncp->nc_flag);
 		if ((nc_flag & NCF_ISDOTDOT) != 0) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		/* ">=" leaves room for the '/' stored right after the name. */
 		if (ncp->nc_nlen >= *buflen) {
 			cache_rev_failed(&reason);
 			error = ENOMEM;
 			goto out_abort;
 		}
 		/*
 		 * Copy speculatively; the checks below decide whether the
 		 * bytes just copied can be trusted.
 		 */
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 		*buflen -= 1;
 		buf[*buflen] = '/';
 		tvp = ncp->nc_dvp;
 		tvp_seqc = vn_seqc_read_any(tvp);
 		if (seqc_in_modify(tvp_seqc)) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		if (!vn_seqc_consistent(vp, vp_seqc)) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		/*
 		 * Acquire fence provided by vn_seqc_read_any above.
 		 */
 		if (__predict_false(atomic_load_ptr(&vp->v_cache_dd) != ncp)) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		if (!cache_ncp_canuse(ncp)) {
 			cache_rev_failed(&reason);
 			goto out_abort;
 		}
 		vp = tvp;
 		vp_seqc = tvp_seqc;
 		if (vp == rdir || vp == rootvnode)
 			break;
 	}
 out_ok:
 	vfs_smr_exit();
 	*retbuf = buf + *buflen;
 	/* Bytes used by this function plus the tail supplied by the caller. */
 	*buflen = orig_buflen - *buflen + addend;
 	SDT_PROBE2(vfs, namecache, fullpath_smr, hit, startvp, *retbuf);
 	return (0);
 
 out_abort:
 	/* Undo the size accounting so that the caller can start over. */
 	*buflen = orig_buflen;
 	SDT_PROBE4(vfs, namecache, fullpath_smr, miss, startvp, ncp, reason, i);
 	vfs_smr_exit();
 	return (error);
 }
 
 /*
  * Locked variant of the vnode-to-path translation for an arbitrary vnode.
  *
  * Directories are handed straight to vn_fullpath_dir().  For anything else
  * the leaf name is resolved first and stored at the end of the buffer; the
  * number of bytes so used is passed along as the addend.
  */
 static int
 vn_fullpath_any(struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf,
     size_t *buflen)
 {
 	size_t total, tail;
 	int error;
 
 	total = *buflen;
 	if (total < 2)
 		return (EINVAL);
 
 	/* vn_fullpath_dir() releases one reference on the vnode it is given. */
 	vref(vp);
 	if (vp->v_type == VDIR)
 		return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, 0));
 
 	/* Terminator first, then the leaf name in front of it. */
 	*buflen = total - 1;
 	buf[*buflen] = '\0';
 	error = vn_vptocnp(&vp, buf, buflen);
 	if (error != 0)
 		return (error);
 	if (*buflen == 0) {
 		/* No room for the '/' preceding the leaf name. */
 		vrele(vp);
 		return (ENOMEM);
 	}
 	*buflen -= 1;
 	buf[*buflen] = '/';
 
 	tail = total - *buflen;
 	return (vn_fullpath_dir(vp, rdir, buf, retbuf, buflen, tail));
 }
 
 /*
  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
  *
  * Since the namecache does not track hardlinks, the caller is expected to
  * first look up the target vnode with WANTPARENT flag passed to namei to get
  * dvp and vp.
  *
  * Then we have 2 cases:
  * - if the found vnode is a directory, the path can be constructed just by
  *   following names up the chain
  * - otherwise we populate the buffer with the saved name and start resolving
  *   from the parent
  */
 int
 vn_fullpath_hardlink(struct vnode *vp, struct vnode *dvp,
     const char *hrdl_name, size_t hrdl_name_length,
     char **retbuf, char **freebuf, size_t *buflen)
 {
 	char *buf, *tmpbuf;
 	struct pwd *pwd;
 	size_t addend;
 	int error;
 	__enum_uint8(vtype) type;
 
 	if (*buflen < 2)
 		return (EINVAL);
 	/* Never allocate more than the longest supported path. */
 	if (*buflen > MAXPATHLEN)
 		*buflen = MAXPATHLEN;
 
 	/*
 	 * Scratch buffer for the path.  Ownership passes to the caller via
 	 * *freebuf on success; it is freed here on failure.
 	 */
 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
 
 	addend = 0;
 
 	/*
 	 * Check for VBAD to work around the vp_crossmp bug in lookup().
 	 *
 	 * For example consider tmpfs on /tmp and realpath /tmp. ni_vp will be
 	 * set to mount point's root vnode while ni_dvp will be vp_crossmp.
 	 * If the type is VDIR (like in this very case) we can skip looking
 	 * at ni_dvp in the first place. However, since vnodes get passed here
 	 * unlocked the target may transition to doomed state (type == VBAD)
 	 * before we get to evaluate the condition. If this happens, we will
 	 * populate part of the buffer and descend to vn_fullpath_dir with
 	 * vp == vp_crossmp. Prevent the problem by checking for VBAD.
 	 */
 	type = atomic_load_8(&vp->v_type);
 	if (type == VBAD) {
 		error = ENOENT;
 		goto out_bad;
 	}
 	if (type != VDIR) {
 		/* Room for the leading '/', the name and the terminating NUL. */
 		addend = hrdl_name_length + 2;
 		if (*buflen < addend) {
 			error = ENOMEM;
 			goto out_bad;
 		}
 		/* Place "/<name>\0" at the very end of the buffer. */
 		*buflen -= addend;
 		tmpbuf = buf + *buflen;
 		tmpbuf[0] = '/';
 		memcpy(&tmpbuf[1], hrdl_name, hrdl_name_length);
 		tmpbuf[addend - 1] = '\0';
 		/* The rest of the path is that of the parent directory. */
 		vp = dvp;
 	}
 
 	/* Try the lockless resolution first; it leaves the SMR section. */
 	vfs_smr_enter();
 	pwd = pwd_get_smr();
 	error = vn_fullpath_any_smr(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 	    addend);
 	VFS_SMR_ASSERT_NOT_ENTERED();
 	/*
 	 * A negative value means the lockless walk could not be completed;
 	 * redo the work with references held.  The extra vref() is released
 	 * by vn_fullpath_dir().
 	 */
 	if (error < 0) {
 		pwd = pwd_hold(curthread);
 		vref(vp);
 		error = vn_fullpath_dir(vp, pwd->pwd_rdir, buf, retbuf, buflen,
 		    addend);
 		pwd_drop(pwd);
 	}
 	if (error != 0)
 		goto out_bad;
 
 	*freebuf = buf;
 
 	return (0);
 out_bad:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 struct vnode *
 vn_dir_dd_ino(struct vnode *vp)
 {
 	struct namecache *ncp;
 	struct vnode *ddvp;
 	struct mtx *vlp;
 	enum vgetstate vs;
 
 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
 		vs = vget_prep(ddvp);
 		mtx_unlock(vlp);
 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
 			return (NULL);
 		return (ddvp);
 	}
 	mtx_unlock(vlp);
 	return (NULL);
 }
 
 /*
  * Copy the name under which vp is known to the namecache into buf.
  *
  * The name is taken from the first cache entry for vp which is not a ".."
  * entry and is truncated to fit; buf is always NUL-terminated on success.
  *
  * Returns 0 on success, ENOENT if there is no suitable entry and EINVAL if
  * buflen is 0.
  */
 int
 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 {
 	struct namecache *ncp;
 	struct mtx *vlp;
 	int l;
 
 	/*
 	 * A zero-sized buffer cannot even hold the terminator, and
 	 * "buflen - 1" below would wrap around to UINT_MAX, disabling the
 	 * truncation and overrunning buf.
 	 */
 	if (buflen == 0)
 		return (EINVAL);
 
 	vlp = VP2VNODELOCK(vp);
 	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	if (ncp == NULL) {
 		mtx_unlock(vlp);
 		return (ENOENT);
 	}
 	/* Leave one byte for the terminating NUL. */
 	l = min(ncp->nc_nlen, buflen - 1);
 	memcpy(buf, ncp->nc_name, l);
 	mtx_unlock(vlp);
 	buf[l] = '\0';
 	return (0);
 }
 
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
  *
  * Requires a locked, referenced vnode.
  * On success the vnode is returned locked again (by the path re-lookup);
  * on any error it is left unlocked and the caller's reference is released.
  *
  * If vp is a directory, the call to vn_fullpath_global() always succeeds
  * because it falls back to the ".." lookup if the namecache lookup fails.
  */
 int
 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
     u_int pathlen)
 {
 	struct nameidata nd;
 	char *rpath, *fbuf;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* The global path is computed with the vnode unlocked. */
 	VOP_UNLOCK(vp);
 	error = vn_fullpath_global(vp, &rpath, &fbuf);
 	if (error != 0) {
 		/* fbuf is not touched on this path. */
 		vrele(vp);
 		return (error);
 	}
 
 	/* The result, including its terminator, has to fit into path. */
 	if (strlen(rpath) >= pathlen) {
 		error = ENAMETOOLONG;
 		goto out_rele;
 	}
 
 	/*
 	 * Look the caller-supplied path up once more.  This both relocks
 	 * the vnode (LOCKLEAF) and lets us notice a rename which happened
 	 * while it was unlocked: in that case a different vnode is found
 	 * and ENOENT is returned.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
 	error = namei(&nd);
 	if (error != 0)
 		goto out_rele;
 	NDFREE_PNBUF(&nd);
 
 	/* The lookup holds its own reference; drop the one passed in. */
 	vrele(vp);
 	if (nd.ni_vp != vp) {
 		vput(nd.ni_vp);
 		error = ENOENT;
 	} else {
 		strcpy(path, rpath);
 	}
 	goto out_free;
 
 out_rele:
 	vrele(vp);
 out_free:
 	free(fbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * This is similar to vn_path_to_global_path but allows for regular
  * files which may not be present in the cache.
  *
  * Requires a locked, referenced vnode.
  * On success the vnode is returned locked again (by the path re-lookup);
  * on any error it is left unlocked and the caller's reference is released.
  */
 int
 vn_path_to_global_path_hardlink(struct thread *td, struct vnode *vp,
     struct vnode *dvp, char *path, u_int pathlen, const char *leaf_name,
     size_t leaf_length)
 {
 	struct nameidata nd;
 	char *rpath, *fbuf;
 	size_t len;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/*
 	 * Build the global path out of the parent directory, the vnode and
 	 * the name of the leaf.  This is done with the vnode unlocked.
 	 */
 	VOP_UNLOCK(vp);
 	len = pathlen;
 	error = vn_fullpath_hardlink(vp, dvp, leaf_name, leaf_length,
 	    &rpath, &fbuf, &len);
 	if (error != 0) {
 		/* vn_fullpath_hardlink() freed its buffer already. */
 		vrele(vp);
 		return (error);
 	}
 
 	/* The result, including its terminator, has to fit into path. */
 	if (strlen(rpath) >= pathlen) {
 		error = ENAMETOOLONG;
 		goto out_rele;
 	}
 
 	/*
 	 * Repeat the lookup of the caller-supplied path.  The vnode gets
 	 * relocked as a side effect (LOCKLEAF); finding a different vnode
 	 * means it was renamed in the meantime, which is reported as ENOENT.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path);
 	error = namei(&nd);
 	if (error != 0)
 		goto out_rele;
 	NDFREE_PNBUF(&nd);
 
 	/* Keep the reference obtained by the lookup, drop the original. */
 	vrele(vp);
 	if (nd.ni_vp == vp) {
 		strcpy(path, rpath);
 	} else {
 		vput(nd.ni_vp);
 		error = ENOENT;
 	}
 	goto out_free;
 
 out_rele:
 	vrele(vp);
 out_free:
 	free(fbuf, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * Debugger helper: print one line per vnode on the way from vp towards the
  * root, using the first namecache entry of each vnode for its name.  The walk
  * ends at the root vnode or at a vnode without any cache entry.
  */
 static void
 db_print_vpath(struct vnode *vp)
 {
 	struct namecache *ncp;
 	int i;
 
 	while (vp != NULL) {
 		db_printf("%p: ", vp);
 		if (vp == rootvnode) {
 			db_printf("/");
 			vp = NULL;
 		} else if (vp->v_vflag & VV_ROOT) {
 			/* Continue from the vnode this filesystem covers. */
 			db_printf("<mount point>");
 			vp = vp->v_mount->mnt_vnodecovered;
 		} else {
 			ncp = TAILQ_FIRST(&vp->v_cache_dst);
 			if (ncp == NULL) {
 				vp = NULL;
 			} else {
 				/* The name is not NUL-terminated. */
 				for (i = 0; i < ncp->nc_nlen; i++)
 					db_printf("%c", ncp->nc_name[i]);
 				vp = ncp->nc_dvp;
 			}
 		}
 		db_printf("\n");
 	}
 }
 
 /* "show vpath <addr>": print the cached path of the vnode at addr. */
 DB_SHOW_COMMAND(vpath, db_show_vpath)
 {
 
 	if (have_addr) {
 		db_print_vpath((struct vnode *)addr);
 		return;
 	}
 	db_printf("usage: show vpath <struct vnode *>\n");
 }
 
 #endif
 
 /*
  * Administrative switch for the lockless lookup; read when recomputing
  * cache_fast_lookup_enabled and exported through the sysctl defined below.
  */
 static int cache_fast_lookup = 1;
 
 /*
  * Returned by the cache_fpl_aborted*() helpers to signal that the fast path
  * did not produce a result.
  */
 #define CACHE_FPL_FAILED	-2020
 
 /*
  * Placeholder installed by cache_vop_vector_register() for vop vectors which
  * provide no vop_fplookup_vexec.  Reaching it is a bug: print the vnode and
  * panic.
  */
 static int
 cache_vop_bad_vexec(struct vop_fplookup_vexec_args *v)
 {
 	vn_printf(v->a_vp, "no proper vop_fplookup_vexec\n");
 	panic("no proper vop_fplookup_vexec");
 }
 
 /*
  * Placeholder installed by cache_vop_vector_register() for vop vectors which
  * provide no vop_fplookup_symlink.  Reaching it is a bug: print the vnode and
  * panic.
  */
 static int
 cache_vop_bad_symlink(struct vop_fplookup_symlink_args *v)
 {
 	vn_printf(v->a_vp, "no proper vop_fplookup_symlink\n");
 	panic("no proper vop_fplookup_symlink");
 }
 
 void
 cache_vop_vector_register(struct vop_vector *v)
 {
 	size_t ops;
 
 	ops = 0;
 	if (v->vop_fplookup_vexec != NULL) {
 		ops++;
 	}
 	if (v->vop_fplookup_symlink != NULL) {
 		ops++;
 	}
 
 	if (ops == 2) {
 		return;
 	}
 
 	if (ops == 0) {
 		v->vop_fplookup_vexec = cache_vop_bad_vexec;
 		v->vop_fplookup_symlink = cache_vop_bad_symlink;
 		return;
 	}
 
 	printf("%s: invalid vop vector %p -- either all or none fplookup vops "
 	    "need to be provided",  __func__, v);
 	if (v->vop_fplookup_vexec == NULL) {
 		printf("%s: missing vop_fplookup_vexec\n", __func__);
 	}
 	if (v->vop_fplookup_symlink == NULL) {
 		printf("%s: missing vop_fplookup_symlink\n", __func__);
 	}
 	panic("bad vop vector %p", v);
 }
 
 #ifdef INVARIANTS
 /*
  * Debug check: a mount which has MNTK_FPLOOKUP set must come with a vop
  * vector providing real (non-placeholder) lockless lookup routines.
  */
 void
 cache_validate_vop_vector(struct mount *mp, struct vop_vector *vops)
 {
 
 	/* Nothing to verify unless the mount opted into lockless lookup. */
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
 		return;
 
 	if (vops->vop_fplookup_vexec == NULL ||
 	    vops->vop_fplookup_vexec == cache_vop_bad_vexec) {
 		panic("bad vop_fplookup_vexec on vector %p for filesystem %s",
 		    vops, mp->mnt_vfc->vfc_name);
 	}
 
 	if (vops->vop_fplookup_symlink == NULL ||
 	    vops->vop_fplookup_symlink == cache_vop_bad_symlink) {
 		panic("bad vop_fplookup_symlink on vector %p for filesystem %s",
 		    vops, mp->mnt_vfc->vfc_name);
 	}
 }
 #endif
 
 /*
  * Recompute cache_fast_lookup_enabled.  The lockless lookup is enabled only
  * if the cache_fast_lookup knob is non-zero and, on kernels built with MAC,
  * neither the lookup nor the readlink MAC check is enabled.
  */
 void
 cache_fast_lookup_enabled_recalc(void)
 {
 	int knob;
 	int mac_on;
 
 #ifdef MAC
 	mac_on = mac_vnode_check_lookup_enabled();
 	mac_on |= mac_vnode_check_readlink_enabled();
 #else
 	mac_on = 0;
 #endif
 
 	knob = atomic_load_int(&cache_fast_lookup);
 	atomic_store_char(&cache_fast_lookup_enabled, knob && !mac_on);
 }
 
 /*
  * Sysctl handler for the fast_lookup knob: after a successful write which
  * changed the value, recompute whether the lockless lookup is enabled.
  */
 static int
 syscal_vfs_cache_fast_lookup(SYSCTL_HANDLER_ARGS)
 {
 	int error, before;
 
 	before = atomic_load_int(&cache_fast_lookup);
 	error = sysctl_handle_int(oidp, arg1, arg2, req);
 	/* Failed requests and plain reads change nothing. */
 	if (error != 0 || !req->newptr)
 		return (error);
 	if (before != atomic_load_int(&cache_fast_lookup))
 		cache_fast_lookup_enabled_recalc();
 	return (error);
 }
 SYSCTL_PROC(_vfs_cache_param, OID_AUTO, fast_lookup, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
     &cache_fast_lookup, 0, syscal_vfs_cache_fast_lookup, "IU", "");
 
 /*
  * Components of nameidata (or objects it can point to) which may
  * need restoring in case fast path lookup fails.
  */
 /* Values captured once by cache_fpl_checkpoint_outer(). */
 struct nameidata_outer {
 	size_t ni_pathlen;	/* restored when the lookup is aborted */
 	int cn_flags;		/* restored on both partial and abort */
 };
 
 /*
  * Values captured by cache_fpl_checkpoint().  Only used for consistency
  * checks, hence empty without INVARIANTS.
  */
 struct nameidata_saved {
 #ifdef INVARIANTS
 	char *cn_nameptr;	/* cn_nameptr at the time of the checkpoint */
 	size_t ni_pathlen;	/* debug.ni_pathlen at the time of the checkpoint */
 #endif
 };
 
 #ifdef INVARIANTS
 /* Shadow copy of the remaining path length, used for cross-checking. */
 struct cache_fpl_debug {
 	size_t ni_pathlen;
 };
 #endif
 
 /* State of a single lockless (fast path) lookup. */
 struct cache_fpl {
 	struct nameidata *ndp;		/* lookup being served */
 	struct componentname *cnp;	/* component name state of the lookup */
 	/* presumably the terminating NUL of the path buffer -- confirm */
 	char *nulchar;
 	struct vnode *dvp;		/* directory currently searched */
 	struct vnode *tvp;		/* vnode found for the component */
 	seqc_t dvp_seqc;		/* sequence counter read for dvp */
 	seqc_t tvp_seqc;		/* sequence counter read for tvp */
 	uint32_t hash;
 	struct nameidata_saved snd;	/* see cache_fpl_checkpoint() */
 	struct nameidata_outer snd_outer; /* see cache_fpl_checkpoint_outer() */
 	int line;			/* source line which set the status */
 	enum cache_fpl_status status:8;	/* outcome, UNSET while in progress */
 	bool in_smr;			/* tracks the vfs SMR section */
 	bool fsearch;			/* as reported by fgetvp_lookup_smr() */
 	struct pwd **pwd;
 #ifdef INVARIANTS
 	struct cache_fpl_debug debug;
 #endif
 };
 
 /* Forward declarations for routines used before their definition. */
 static bool cache_fplookup_mp_supported(struct mount *mp);
 static bool cache_fplookup_is_mp(struct cache_fpl *fpl);
 static int cache_fplookup_cross_mount(struct cache_fpl *fpl);
 static int cache_fplookup_partial_setup(struct cache_fpl *fpl);
 static int cache_fplookup_skip_slashes(struct cache_fpl *fpl);
 static int cache_fplookup_trailingslash(struct cache_fpl *fpl);
 static void cache_fpl_pathlen_dec(struct cache_fpl *fpl);
 static void cache_fpl_pathlen_inc(struct cache_fpl *fpl);
 static void cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n);
 static void cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n);
 
 /*
  * Return the path buffer to namei_zone and clear the pointers into it so
  * that they cannot be used afterwards.
  */
 static void
 cache_fpl_cleanup_cnp(struct componentname *cnp)
 {
 
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 	cnp->cn_pnbuf = NULL;
 	cnp->cn_nameptr = NULL;
 }
 
 /*
  * The remaining path starts with '/': consume that slash together with any
  * slashes directly following it, keeping the path length accounting in sync,
  * and return the root directory of the lookup.
  */
 static struct vnode *
 cache_fpl_handle_root(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 
 	cnp = fpl->cnp;
 
 	MPASS(*(cnp->cn_nameptr) == '/');
 	do {
 		cnp->cn_nameptr++;
 		cache_fpl_pathlen_dec(fpl);
 	} while (*(cnp->cn_nameptr) == '/');
 
 	return (fpl->ndp->ni_rootdir);
 }
 
 static void
 cache_fpl_checkpoint_outer(struct cache_fpl *fpl)
 {
 
 	fpl->snd_outer.ni_pathlen = fpl->ndp->ni_pathlen;
 	fpl->snd_outer.cn_flags = fpl->ndp->ni_cnd.cn_flags;
 }
 
 /*
  * Record the current name pointer and the shadow path length for later
  * consistency checks.  Does nothing without INVARIANTS.
  */
 static void
 cache_fpl_checkpoint(struct cache_fpl *fpl)
 {
 
 #ifdef INVARIANTS
 	fpl->snd.cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr;
 	fpl->snd.ni_pathlen = fpl->debug.ni_pathlen;
 #endif
 }
 
 /*
  * Put back the flags saved by cache_fpl_checkpoint_outer() and, with
  * INVARIANTS, the shadow path length saved by cache_fpl_checkpoint().
  */
 static void
 cache_fpl_restore_partial(struct cache_fpl *fpl)
 {
 
 	fpl->ndp->ni_cnd.cn_flags = fpl->snd_outer.cn_flags;
 #ifdef INVARIANTS
 	fpl->debug.ni_pathlen = fpl->snd.ni_pathlen;
 #endif
 }
 
 static void
 cache_fpl_restore_abort(struct cache_fpl *fpl)
 {
 
 	cache_fpl_restore_partial(fpl);
 	/*
 	 * It is 0 on entry by API contract.
 	 */
 	fpl->ndp->ni_resflags = 0;
 	fpl->ndp->ni_cnd.cn_nameptr = fpl->ndp->ni_cnd.cn_pnbuf;
 	fpl->ndp->ni_pathlen = fpl->snd_outer.ni_pathlen;
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that both the bookkeeping flag in the lookup state and the actual
  * SMR state agree that we are inside an SMR section.
  */
 #define cache_fpl_smr_assert_entered(fpl) ({			\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == true);				\
 	VFS_SMR_ASSERT_ENTERED();				\
 })
 /* The opposite of the above: we must be outside of any SMR section. */
 #define cache_fpl_smr_assert_not_entered(fpl) ({		\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == false);				\
 	VFS_SMR_ASSERT_NOT_ENTERED();				\
 })
 /*
  * Assert that the lookup has reached some final status; still being UNSET
  * is treated as unreachable.
  */
 static void
 cache_fpl_assert_status(struct cache_fpl *fpl)
 {
 
 	switch (fpl->status) {
 	case CACHE_FPL_STATUS_UNSET:
 		__assert_unreachable();
 		break;
 	case CACHE_FPL_STATUS_DESTROYED:
 	case CACHE_FPL_STATUS_ABORTED:
 	case CACHE_FPL_STATUS_PARTIAL:
 	case CACHE_FPL_STATUS_HANDLED:
 		break;
 	}
 }
 #else
 /* Without INVARIANTS the checks compile to nothing. */
 #define cache_fpl_smr_assert_entered(fpl) do { } while (0)
 #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0)
 #define cache_fpl_assert_status(fpl) do { } while (0)
 #endif
 
 /*
  * Enter the SMR section and record that in the lookup state.  Unlike
  * cache_fpl_smr_enter() this variant does not check the previous value of
  * in_smr.
  */
 #define cache_fpl_smr_enter_initial(fpl) ({			\
 	struct cache_fpl *_fpl = (fpl);				\
 	vfs_smr_enter();					\
 	_fpl->in_smr = true;					\
 })
 
 /* Enter the SMR section; the lookup must not be inside of one already. */
 #define cache_fpl_smr_enter(fpl) ({				\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == false);				\
 	vfs_smr_enter();					\
 	_fpl->in_smr = true;					\
 })
 
 /* Leave the SMR section; the lookup must currently be inside of one. */
 #define cache_fpl_smr_exit(fpl) ({				\
 	struct cache_fpl *_fpl = (fpl);				\
 	MPASS(_fpl->in_smr == true);				\
 	vfs_smr_exit();						\
 	_fpl->in_smr = false;					\
 })
 
 /*
  * Mark the lookup as aborted before any state was modified, i.e. without
  * restoring anything.  Must be called outside of the SMR section.  The line
  * of the call site is recorded for debugging.
  */
 static int
 cache_fpl_aborted_early_impl(struct cache_fpl *fpl, int line)
 {
 
 	/* Only an unset or a partial result may be turned into an abort. */
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET ||
 	    fpl->status == CACHE_FPL_STATUS_PARTIAL,
 	    ("%s: converting to abort from %d at %d, set at %d\n",
 	    __func__, fpl->status, line, fpl->line));
 	cache_fpl_smr_assert_not_entered(fpl);
 	fpl->line = line;
 	fpl->status = CACHE_FPL_STATUS_ABORTED;
 	return (CACHE_FPL_FAILED);
 }
 
 #define cache_fpl_aborted_early(x)	cache_fpl_aborted_early_impl((x), __LINE__)
 
 /*
  * Give up on the fast path: leave the SMR section if still inside of it and
  * rewind the lookup state to what it was on entry.  If symlinks were already
  * resolved the path buffer no longer holds the caller's data, so it is freed
  * and the status is set to DESTROYED instead of ABORTED.
  */
 static int __noinline
 cache_fpl_aborted_impl(struct cache_fpl *fpl, int line)
 {
 
 	/* Only an unset or a partial result may be turned into an abort. */
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET ||
 	    fpl->status == CACHE_FPL_STATUS_PARTIAL,
 	    ("%s: converting to abort from %d at %d, set at %d\n",
 	    __func__, fpl->status, line, fpl->line));
 	fpl->line = line;
 	fpl->status = CACHE_FPL_STATUS_ABORTED;
 	if (fpl->in_smr)
 		cache_fpl_smr_exit(fpl);
 	cache_fpl_restore_abort(fpl);
 
 	/*
 	 * Symlink resolution overwrote the data passed in by the caller;
 	 * make namei aware of that.
 	 */
 	if (fpl->ndp->ni_loopcnt > 0) {
 		fpl->status = CACHE_FPL_STATUS_DESTROYED;
 		cache_fpl_cleanup_cnp(fpl->cnp);
 	}
 	return (CACHE_FPL_FAILED);
 }
 
 #define cache_fpl_aborted(x)	cache_fpl_aborted_impl((x), __LINE__)
 
 /*
  * Mark the lookup as partially handled and prepare the state for the regular
  * lookup to continue from the current directory.  Must be called with no
  * status set yet and from within the SMR section.  Returns whatever
  * cache_fplookup_partial_setup() returns.
  */
 static int __noinline
 cache_fpl_partial_impl(struct cache_fpl *fpl, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 	    ("%s: setting to partial at %d, but already set to %d at %d\n",
 	    __func__, line, fpl->status, fpl->line));
 	cache_fpl_smr_assert_entered(fpl);
 	fpl->status = CACHE_FPL_STATUS_PARTIAL;
 	fpl->line = line;
 	return (cache_fplookup_partial_setup(fpl));
 }
 
 #define cache_fpl_partial(x)	cache_fpl_partial_impl((x), __LINE__)
 
 /*
  * Mark the lookup as successfully completed by the fast path.  Must be called
  * with no status set yet and outside of the SMR section.  Always returns 0.
  */
 static int
 cache_fpl_handled_impl(struct cache_fpl *fpl, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
 	    __func__, line, fpl->status, fpl->line));
 	cache_fpl_smr_assert_not_entered(fpl);
 	fpl->status = CACHE_FPL_STATUS_HANDLED;
 	fpl->line = line;
 	return (0);
 }
 
 #define cache_fpl_handled(x)	cache_fpl_handled_impl((x), __LINE__)
 
 /*
  * Mark the lookup as completed by the fast path with the given error.  The
  * error has to be a real one: neither 0 nor CACHE_FPL_FAILED.  Both vnode
  * pointers in the lookup state are cleared.  Must be called with no status
  * set yet and outside of the SMR section.  Returns the error passed in.
  */
 static int
 cache_fpl_handled_error_impl(struct cache_fpl *fpl, int error, int line)
 {
 
 	KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET,
 	    ("%s: setting to handled at %d, but already set to %d at %d\n",
 	    __func__, line, fpl->status, fpl->line));
 	MPASS(error != 0);
 	MPASS(error != CACHE_FPL_FAILED);
 	cache_fpl_smr_assert_not_entered(fpl);
 	fpl->status = CACHE_FPL_STATUS_HANDLED;
 	fpl->line = line;
 	fpl->dvp = NULL;
 	fpl->tvp = NULL;
 	return (error);
 }
 
 #define cache_fpl_handled_error(x, e)	cache_fpl_handled_error_impl((x), (e), __LINE__)
 
 /* Tell whether the lookup already has a status, i.e. is no longer UNSET. */
 static bool
 cache_fpl_terminated(struct cache_fpl *fpl)
 {
 
 	return (fpl->status != CACHE_FPL_STATUS_UNSET);
 }
 
 /*
  * Lookup flags the fast path knows how to deal with.  cache_can_fplookup()
  * refuses any request carrying a flag outside of this set.
  */
 #define CACHE_FPL_SUPPORTED_CN_FLAGS \
 	(NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \
 	 FAILIFEXISTS | FOLLOW | EMPTYPATH | LOCKSHARED | ISRESTARTED | WILLBEDIR | \
 	 ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2 | NOCAPCHECK | OPENREAD | \
 	 OPENWRITE | WANTIOCTLCAPS)
 
 /* Flags set by the lookup code itself rather than passed in by callers. */
 #define CACHE_FPL_INTERNAL_CN_FLAGS \
 	(ISDOTDOT | MAKEENTRY | ISLASTCN)
 
 /* The two sets above must stay disjoint. */
 _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
     "supported and internal flags overlap");
 
 /*
  * Tell whether the component being looked up is the last one, i.e. whether
  * ni_next points at the end of the path string.
  */
 static bool
 cache_fpl_islastcn(struct nameidata *ndp)
 {
 	return (ndp->ni_next[0] == '\0');
 }
 
 /* Tell whether the character right in front of nulchar is a slash. */
 static bool
 cache_fpl_istrailingslash(struct cache_fpl *fpl)
 {
 	const char *end;
 
 	end = fpl->nulchar;
 	/* There has to be at least one character in front of it. */
 	MPASS(end > fpl->cnp->cn_pnbuf);
 	return (end[-1] == '/');
 }
 
 /* Tell whether the current component is exactly "..". */
 static bool
 cache_fpl_isdotdot(struct componentname *cnp)
 {
 	const char *name;
 
 	if (cnp->cn_namelen != 2)
 		return (false);
 	name = cnp->cn_nameptr;
 	return (name[1] == '.' && name[0] == '.');
 }
 
 /*
  * Decide whether the fast path may be attempted for this lookup at all.
  *
  * Returns true if so.  Otherwise the lookup is marked as aborted (recording
  * the line of the check which failed) and false is returned.  The checks are
  * kept separate so that each of them records its own line.
  */
 static bool
 cache_can_fplookup(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct thread *td;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	td = curthread;
 
 	/* Globally disabled, see cache_fast_lookup_enabled_recalc(). */
 	if (!atomic_load_char(&cache_fast_lookup_enabled)) {
 		cache_fpl_aborted_early(fpl);
 		return (false);
 	}
 	/* The caller asked for something the fast path does not implement. */
 	if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) {
 		cache_fpl_aborted_early(fpl);
 		return (false);
 	}
 	/* Threads in capability mode or with capability tracing enabled. */
 	if (IN_CAPABILITY_MODE(td) || CAP_TRACING(td)) {
 		cache_fpl_aborted_early(fpl);
 		return (false);
 	}
 	/* Threads which are being audited. */
 	if (AUDITING_TD(td)) {
 		cache_fpl_aborted_early(fpl);
 		return (false);
 	}
 	/* Lookups with an explicit starting directory. */
 	if (ndp->ni_startdir != NULL) {
 		cache_fpl_aborted_early(fpl);
 		return (false);
 	}
 	return (true);
 }
 
 /*
  * Obtain the starting vnode for a lookup relative to a file descriptor.
  *
  * Failure to get the vnode aborts the fast path.  A vnode which is not a
  * directory is only acceptable for an empty path with EMPTYPATH set,
  * otherwise the lookup is completed with ENOTDIR.  Returns 0 on success.
  */
 static int __noinline
 cache_fplookup_dirfd(struct cache_fpl *fpl, struct vnode **vpp)
 {
 	struct componentname *cnp;
 	bool fsearch;
 
 	cnp = fpl->cnp;
 
 	if (__predict_false(fgetvp_lookup_smr(fpl->ndp, vpp, &fsearch) != 0)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	fpl->fsearch = fsearch;
 
 	if ((*vpp)->v_type == VDIR)
 		return (0);
 	if ((cnp->cn_flags & EMPTYPATH) != 0 && cnp->cn_pnbuf[0] == '\0')
 		return (0);
 
 	/* Errors are reported from outside of the SMR section. */
 	cache_fpl_smr_exit(fpl);
 	return (cache_fpl_handled_error(fpl, ENOTDIR));
 }
 
 /*
  * Leave the SMR section and try to promote the negative entry which was hit.
  * If that works out the lookup is completed with ENOENT, otherwise the fast
  * path is aborted.
  */
 static int __noinline
 cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp,
     uint32_t hash)
 {
 
 	cache_fpl_smr_exit(fpl);
 	if (!cache_neg_promote_cond(fpl->dvp, fpl->cnp, oncp, hash))
 		return (cache_fpl_aborted(fpl));
 	return (cache_fpl_handled_error(fpl, ENOENT));
 }
 
 /*
  * The target vnode is not supported, prepare for the slow path to take over.
  */
 static int __noinline
 cache_fplookup_partial_setup(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	enum vgetstate dvs;
 	struct vnode *dvp;
 	struct pwd *pwd;
 	seqc_t dvp_seqc;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	pwd = *(fpl->pwd);
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 
 	/*
 	 * Take a reference on pwd while still inside the SMR section.  It
 	 * is dropped again on every failure path below.
 	 */
 	if (!pwd_hold_smr(pwd)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * Note that seqc is checked before the vnode is locked, so by
 	 * the time regular lookup gets to it it may have moved.
 	 *
 	 * Ultimately this does not affect correctness, any lookup errors
 	 * are userspace racing with itself. It is guaranteed that any
 	 * path which ultimately gets found could also have been found
 	 * by regular lookup going all the way in absence of concurrent
 	 * modifications.
 	 */
 	dvs = vget_prep_smr(dvp);
 	cache_fpl_smr_exit(fpl);
 	if (__predict_false(dvs == VGET_NONE)) {
 		pwd_drop(pwd);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/* Turn the prepared state into a real reference, then re-validate. */
 	vget_finish_ref(dvp, dvs);
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vrele(dvp);
 		pwd_drop(pwd);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_restore_partial(fpl);
 #ifdef INVARIANTS
 	if (cnp->cn_nameptr != fpl->snd.cn_nameptr) {
 		panic("%s: cn_nameptr mismatch (%p != %p) full [%s]\n", __func__,
 		    cnp->cn_nameptr, fpl->snd.cn_nameptr, cnp->cn_pnbuf);
 	}
 #endif
 
 	/*
 	 * Hand the referenced directory over as the starting point and set
 	 * the internal flags according to the component we stopped at.
 	 */
 	ndp->ni_startdir = dvp;
 	cnp->cn_flags |= MAKEENTRY;
 	if (cache_fpl_islastcn(ndp))
 		cnp->cn_flags |= ISLASTCN;
 	if (cache_fpl_isdotdot(cnp))
 		cnp->cn_flags |= ISDOTDOT;
 
 	/*
 	 * Skip potential extra slashes parsing did not take care of.
 	 * cache_fplookup_skip_slashes explains the mechanism.
 	 */
 	if (__predict_false(*(cnp->cn_nameptr) == '/')) {
 		do {
 			cnp->cn_nameptr++;
 			cache_fpl_pathlen_dec(fpl);
 		} while (*(cnp->cn_nameptr) == '/');
 	}
 
 	/* Length of what is left to parse, including the terminating NUL. */
 	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 #ifdef INVARIANTS
 	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 	}
 #endif
 	return (0);
 }
 
 static int
 cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs)
 {
 	struct componentname *cnp;
 	struct vnode *tvp;
 	seqc_t tvp_seqc;
 	int error, lkflags;
 
 	cnp = fpl->cnp;
 	tvp = fpl->tvp;
 	tvp_seqc = fpl->tvp_seqc;
 
 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
 		lkflags = LK_SHARED;
 		if ((cnp->cn_flags & LOCKSHARED) == 0)
 			lkflags = LK_EXCLUSIVE;
 		error = vget_finish(tvp, lkflags, tvs);
 		if (__predict_false(error != 0)) {
 			return (cache_fpl_aborted(fpl));
 		}
 	} else {
 		vget_finish_ref(tvp, tvs);
 	}
 
 	if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 		if ((cnp->cn_flags & LOCKLEAF) != 0)
 			vput(tvp);
 		else
 			vrele(tvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	return (cache_fpl_handled(fpl));
 }
 
 /*
  * They want to possibly modify the state of the namecache.
  */
 static int __noinline
 cache_fplookup_final_modifying(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp __diagused;
 	struct componentname *cnp;
 	enum vgetstate dvs;
 	struct vnode *dvp, *tvp;
 	struct mount *mp;
 	seqc_t dvp_seqc;
 	int error;
 	bool docache;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 
 	/* Preconditions: last component, parent wanted, CREATE/DELETE/RENAME. */
 	MPASS(*(cnp->cn_nameptr) != '/');
 	MPASS(cache_fpl_islastcn(ndp));
 	if ((cnp->cn_flags & LOCKPARENT) == 0)
 		MPASS((cnp->cn_flags & WANTPARENT) != 0);
 	MPASS((cnp->cn_flags & TRAILINGSLASH) == 0);
 	MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE ||
 	    cnp->cn_nameiop == RENAME);
 	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 
 	/* Add a cache entry only if NOCACHE is clear and this is a CREATE. */
 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 	if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
 		docache = false;
 
 	/*
 	 * Regular lookup nullifies the slash, which we don't do here.
 	 * Don't take chances with filesystem routines seeing it for
 	 * the last entry.
 	 */
 	if (cache_fpl_istrailingslash(fpl)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	mp = atomic_load_ptr(&dvp->v_mount);
 	if (__predict_false(mp == NULL)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if (__predict_false(mp->mnt_flag & MNT_RDONLY)) {
 		cache_fpl_smr_exit(fpl);
 		/*
 		 * Original code keeps not checking for CREATE which
 		 * might be a bug. For now let the old lookup decide.
 		 */
 		if (cnp->cn_nameiop == CREATE) {
 			return (cache_fpl_aborted(fpl));
 		}
 		return (cache_fpl_handled_error(fpl, EROFS));
 	}
 
 	/* The target is already known to exist, no need to go further. */
 	if (fpl->tvp != NULL && (cnp->cn_flags & FAILIFEXISTS) != 0) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, EEXIST));
 	}
 
 	/*
 	 * Secure access to dvp; check cache_fplookup_partial_setup for
 	 * reasoning.
 	 *
 	 * XXX At least UFS requires its lookup routine to be called for
 	 * the last path component, which leads to some level of complication
 	 * and inefficiency:
 	 * - the target routine always locks the target vnode, but our caller
 	 *   may not need it locked
 	 * - some of the VOP machinery asserts that the parent is locked, which
 	 *   once more may be not required
 	 *
 	 * TODO: add a flag for filesystems which don't need this.
 	 */
 	dvs = vget_prep_smr(dvp);
 	cache_fpl_smr_exit(fpl);
 	if (__predict_false(dvs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	vget_finish_ref(dvp, dvs);
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	error = vn_lock(dvp, LK_EXCLUSIVE);
 	if (__predict_false(error != 0)) {
 		vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * From here on dvp is referenced and exclusively locked.  Let the
 	 * filesystem do the lookup of the last component.
 	 */
 	tvp = NULL;
 	cnp->cn_flags |= ISLASTCN;
 	if (docache)
 		cnp->cn_flags |= MAKEENTRY;
 	if (cache_fpl_isdotdot(cnp))
 		cnp->cn_flags |= ISDOTDOT;
 	cnp->cn_lkflags = LK_EXCLUSIVE;
 	error = VOP_LOOKUP(dvp, &tvp, cnp);
 	switch (error) {
 	case EJUSTRETURN:
 	case 0:
 		break;
 	case ENOTDIR:
 	case ENOENT:
 		/* Definitive answers, reported to the caller as they are. */
 		vput(dvp);
 		return (cache_fpl_handled_error(fpl, error));
 	default:
 		/* Anything else is left for the regular lookup to sort out. */
 		vput(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	fpl->tvp = tvp;
 
 	/* No target vnode: only the parent is returned. */
 	if (tvp == NULL) {
 		MPASS(error == EJUSTRETURN);
 		if ((cnp->cn_flags & LOCKPARENT) == 0) {
 			VOP_UNLOCK(dvp);
 		}
 		return (cache_fpl_handled(fpl));
 	}
 
 	/*
 	 * There are very hairy corner cases concerning various flag combinations
 	 * and locking state. In particular here we only hold one lock instead of
 	 * two.
 	 *
 	 * Skip the complexity as it is of no significance for normal workloads.
 	 */
 	if (__predict_false(tvp == dvp)) {
 		vput(dvp);
 		vrele(tvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * If they want the symlink itself we are fine, but if they want to
 	 * follow it regular lookup has to be engaged.
 	 */
 	if (tvp->v_type == VLNK) {
 		if ((cnp->cn_flags & FOLLOW) != 0) {
 			vput(dvp);
 			vput(tvp);
 			return (cache_fpl_aborted(fpl));
 		}
 	}
 
 	/*
 	 * Since we expect this to be the terminal vnode it should almost never
 	 * be a mount point.
 	 */
 	if (__predict_false(cache_fplookup_is_mp(fpl))) {
 		vput(dvp);
 		vput(tvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if ((cnp->cn_flags & FAILIFEXISTS) != 0) {
 		vput(dvp);
 		vput(tvp);
 		return (cache_fpl_handled_error(fpl, EEXIST));
 	}
 
 	/* Both vnodes are locked now; keep only the locks asked for. */
 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
 		VOP_UNLOCK(tvp);
 	}
 
 	if ((cnp->cn_flags & LOCKPARENT) == 0) {
 		VOP_UNLOCK(dvp);
 	}
 
 	return (cache_fpl_handled(fpl));
 }
 
 /*
  * Dispatch a request whose cn_nameiop is not LOOKUP (that is when the callers
  * in this file invoke it).
  *
  * Only the last path component is handed to cache_fplookup_final_modifying();
  * for any earlier component the result of cache_fpl_partial() is returned.
  */
 static int __noinline
 cache_fplookup_modifying(struct cache_fpl *fpl)
 {
 
 	if (cache_fpl_islastcn(fpl->ndp)) {
 		return (cache_fplookup_final_modifying(fpl));
 	}
 	return (cache_fpl_partial(fpl));
 }
 
 /*
  * Finish a lookup for which the caller also asked for the parent directory:
  * LOCKPARENT and/or WANTPARENT is set, as asserted below.
  *
  * Both fpl->dvp and fpl->tvp go through vget_prep_smr() before
  * cache_fpl_smr_exit() is called. The parent is then finished with
  * LK_EXCLUSIVE when LOCKPARENT is set and with vget_finish_ref() otherwise,
  * its sequence counter is compared against fpl->dvp_seqc, and the child is
  * completed by cache_fplookup_final_child().
  *
  * Returns 0 with fpl->status == CACHE_FPL_STATUS_HANDLED on success, otherwise
  * the value produced by cache_fpl_aborted() or cache_fplookup_final_child().
  */
 static int __noinline
 cache_fplookup_final_withparent(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	enum vgetstate dvs, tvs;
 	struct vnode *dvp, *tvp;
 	seqc_t dvp_seqc;
 	int error;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 	tvp = fpl->tvp;
 
 	MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0);
 
 	/*
 	 * This is less efficient than it can be for simplicity.
 	 */
 	dvs = vget_prep_smr(dvp);
 	if (__predict_false(dvs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	tvs = vget_prep_smr(tvp);
 	if (__predict_false(tvs == VGET_NONE)) {
 		/* Undo the preparation already done on the parent. */
 		cache_fpl_smr_exit(fpl);
 		vget_abort(dvp, dvs);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_smr_exit(fpl);
 
 	if ((cnp->cn_flags & LOCKPARENT) != 0) {
 		error = vget_finish(dvp, LK_EXCLUSIVE, dvs);
 		if (__predict_false(error != 0)) {
 			/*
 			 * Only the child's preparation is undone here; dvp is
 			 * not touched again on this path.
 			 */
 			vget_abort(tvp, tvs);
 			return (cache_fpl_aborted(fpl));
 		}
 	} else {
 		vget_finish_ref(dvp, dvs);
 	}
 
 	/*
 	 * The parent changed since fpl->dvp_seqc was sampled: give up and
 	 * release dvp the way it was acquired above (vput() if it was locked,
 	 * vrele() if only referenced).
 	 */
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vget_abort(tvp, tvs);
 		if ((cnp->cn_flags & LOCKPARENT) != 0)
 			vput(dvp);
 		else
 			vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	error = cache_fplookup_final_child(fpl, tvs);
 	if (__predict_false(error != 0)) {
 		MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED ||
 		    fpl->status == CACHE_FPL_STATUS_DESTROYED);
 		/* The child failed; release the parent as well. */
 		if ((cnp->cn_flags & LOCKPARENT) != 0)
 			vput(dvp);
 		else
 			vrele(dvp);
 		return (error);
 	}
 
 	MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED);
 	return (0);
 }
 
 static int
 cache_fplookup_final(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	enum vgetstate tvs;
 	struct vnode *dvp, *tvp;
 	seqc_t dvp_seqc;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 	tvp = fpl->tvp;
 
 	MPASS(*(cnp->cn_nameptr) != '/');
 
 	if (cnp->cn_nameiop != LOOKUP) {
 		return (cache_fplookup_final_modifying(fpl));
 	}
 
 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0)
 		return (cache_fplookup_final_withparent(fpl));
 
 	tvs = vget_prep_smr(tvp);
 	if (__predict_false(tvs == VGET_NONE)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		cache_fpl_smr_exit(fpl);
 		vget_abort(tvp, tvs);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	cache_fpl_smr_exit(fpl);
 	return (cache_fplookup_final_child(fpl, tvs));
 }
 
 /*
  * Comment from locked lookup:
  * Check for degenerate name (e.g. / or "") which is a way of talking about a
  * directory, e.g. like "/." or ".".
  */
 static int __noinline
 cache_fplookup_degenerate(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	struct vnode *vp;
 	enum vgetstate vs;
 	int lkflags;
 #ifdef INVARIANTS
 	char *p;
 #endif
 
 	cnp = fpl->cnp;
 	vp = fpl->dvp;
 
 	/*
 	 * The name talks about the directory itself, so the result vnode is
 	 * the directory the lookup currently stands in.
 	 */
 	fpl->tvp = vp;
 	fpl->tvp_seqc = fpl->dvp_seqc;
 
 #ifdef INVARIANTS
 	/* A degenerate path consists of nothing but slashes. */
 	for (p = cnp->cn_pnbuf; *p != '\0'; p++) {
 		KASSERT(*p == '/',
 		    ("%s: encountered non-slash; string [%s]\n", __func__,
 		    cnp->cn_pnbuf));
 	}
 #endif
 
 	/*
 	 * Anything other than a plain LOOKUP fails with EISDIR.
 	 */
 	if (__predict_false(cnp->cn_nameiop != LOOKUP)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, EISDIR));
 	}
 
 	if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) {
 		return (cache_fplookup_final_withparent(fpl));
 	}
 
 	vs = vget_prep_smr(vp);
 	cache_fpl_smr_exit(fpl);
 	if (__predict_false(vs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * Without LOCKLEAF the vnode is finished with vget_finish_ref() only.
 	 */
 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
 		vget_finish_ref(vp, vs);
 		return (cache_fpl_handled(fpl));
 	}
 
 	/*
 	 * With LOCKLEAF the lock is shared only if LOCKSHARED was requested.
 	 */
 	lkflags = (cnp->cn_flags & LOCKSHARED) != 0 ? LK_SHARED : LK_EXCLUSIVE;
 	if (__predict_false(vget_finish(vp, lkflags, vs) != 0)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	return (cache_fpl_handled(fpl));
 }
 
 /*
  * Handle an empty pathname (the path buffer starts with a nul).
  *
  * Without EMPTYPATH in cn_flags the lookup fails with ENOENT. Otherwise the
  * starting directory is returned as the result, locked according to LOCKLEAF
  * and LOCKSHARED, and NIRES_EMPTYPATH is set in ni_resflags.
  */
 static int __noinline
 cache_fplookup_emptypath(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct vnode *vp;
 	enum vgetstate vs;
 	int lkflags;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	vp = fpl->dvp;
 
 	fpl->tvp = vp;
 	fpl->tvp_seqc = fpl->dvp_seqc;
 
 	MPASS(*cnp->cn_pnbuf == '\0');
 
 	if (__predict_false((cnp->cn_flags & EMPTYPATH) == 0)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENOENT));
 	}
 
 	MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
 
 	vs = vget_prep_smr(vp);
 	cache_fpl_smr_exit(fpl);
 	if (__predict_false(vs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
 		vget_finish_ref(vp, vs);
 	} else {
 		/* Shared only when explicitly requested with LOCKSHARED. */
 		lkflags = (cnp->cn_flags & LOCKSHARED) != 0 ?
 		    LK_SHARED : LK_EXCLUSIVE;
 		if (__predict_false(vget_finish(vp, lkflags, vs) != 0)) {
 			return (cache_fpl_aborted(fpl));
 		}
 	}
 
 	ndp->ni_resflags |= NIRES_EMPTYPATH;
 	return (cache_fpl_handled(fpl));
 }
 
 /*
  * Handle a path component for which no namecache entry was found.
  *
  * Several checks which the fast path postpones are performed here first:
  * over-long names, spurious slashes, the empty path and degenerate/trailing
  * slash names. Non-LOOKUP operations are handed to cache_fplookup_modifying().
  *
  * For a plain LOOKUP of the last component without a trailing slash the
  * parent directory is referenced and locked with LK_SHARED and the name is
  * resolved with VOP_LOOKUP(). Everything else ends in cache_fpl_partial() or
  * cache_fpl_aborted().
  */
 static int __noinline
 cache_fplookup_noentry(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	enum vgetstate dvs;
 	struct vnode *dvp, *tvp;
 	seqc_t dvp_seqc;
 	int error;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	dvp_seqc = fpl->dvp_seqc;
 
 	MPASS((cnp->cn_flags & MAKEENTRY) == 0);
 	MPASS((cnp->cn_flags & ISDOTDOT) == 0);
 	if (cnp->cn_nameiop == LOOKUP)
 		MPASS((cnp->cn_flags & NOCACHE) == 0);
 	MPASS(!cache_fpl_isdotdot(cnp));
 
 	/*
 	 * Hack: delayed name len checking.
 	 */
 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 	}
 
 	/*
 	 * The component starts with a slash, i.e. there are spurious slashes
 	 * in the path (e.g. "foo///bar").
 	 */
 	if (cnp->cn_nameptr[0] == '/') {
 		return (cache_fplookup_skip_slashes(fpl));
 	}
 
 	/*
 	 * The whole pathname is empty.
 	 */
 	if (cnp->cn_pnbuf[0] == '\0') {
 		return (cache_fplookup_emptypath(fpl));
 	}
 
 	/*
 	 * Empty component in a non-empty path: without a previously resolved
 	 * vnode this is a degenerate name, otherwise a trailing slash.
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (fpl->tvp == NULL) {
 			return (cache_fplookup_degenerate(fpl));
 		}
 		return (cache_fplookup_trailingslash(fpl));
 	}
 
 	if (cnp->cn_nameiop != LOOKUP) {
 		fpl->tvp = NULL;
 		return (cache_fplookup_modifying(fpl));
 	}
 
 	/*
 	 * Only try to fill in the component if it is the last one,
 	 * otherwise not only there may be several to handle but the
 	 * walk may be complicated.
 	 */
 	if (!cache_fpl_islastcn(ndp)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	/*
 	 * Regular lookup nullifies the slash, which we don't do here.
 	 * Don't take chances with filesystem routines seeing it for
 	 * the last entry.
 	 */
 	if (cache_fpl_istrailingslash(fpl)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	/*
 	 * Secure access to dvp; check cache_fplookup_partial_setup for
 	 * reasoning.
 	 */
 	dvs = vget_prep_smr(dvp);
 	cache_fpl_smr_exit(fpl);
 	if (__predict_false(dvs == VGET_NONE)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	vget_finish_ref(dvp, dvs);
 	if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 		vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	error = vn_lock(dvp, LK_SHARED);
 	if (__predict_false(error != 0)) {
 		vrele(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	tvp = NULL;
 	/*
 	 * TODO: provide variants which don't require locking either vnode.
 	 */
 	cnp->cn_flags |= ISLASTCN | MAKEENTRY;
 	cnp->cn_lkflags = LK_SHARED;
 	if ((cnp->cn_flags & LOCKSHARED) == 0) {
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 	}
 	error = VOP_LOOKUP(dvp, &tvp, cnp);
 	switch (error) {
 	case EJUSTRETURN:
 	case 0:
 		break;
 	case ENOTDIR:
 	case ENOENT:
 		/* Definitive answers: report them as the lookup result. */
 		vput(dvp);
 		return (cache_fpl_handled_error(fpl, error));
 	default:
 		vput(dvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	fpl->tvp = tvp;
 
 	/*
 	 * No vnode came back (asserted to be the EJUSTRETURN case). Leave dvp
 	 * in the state requested by the flags: fully released when the parent
 	 * is not wanted, unlocked but still referenced for WANTPARENT alone,
 	 * untouched for LOCKPARENT.
 	 */
 	if (tvp == NULL) {
 		MPASS(error == EJUSTRETURN);
 		if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 			vput(dvp);
 		} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 			VOP_UNLOCK(dvp);
 		}
 		return (cache_fpl_handled(fpl));
 	}
 
 	/*
 	 * A symlink which has to be followed is not handled here.
 	 */
 	if (tvp->v_type == VLNK) {
 		if ((cnp->cn_flags & FOLLOW) != 0) {
 			vput(dvp);
 			vput(tvp);
 			return (cache_fpl_aborted(fpl));
 		}
 	}
 
 	/*
 	 * Neither is a result vnode which is a mount point.
 	 */
 	if (__predict_false(cache_fplookup_is_mp(fpl))) {
 		vput(dvp);
 		vput(tvp);
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0) {
 		VOP_UNLOCK(tvp);
 	}
 
 	/* Same parent disposition as in the tvp == NULL case above. */
 	if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) {
 		vput(dvp);
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0) {
 		VOP_UNLOCK(dvp);
 	}
 	return (cache_fpl_handled(fpl));
 }
 
 /*
  * Resolve the "." path component.
  *
  * Fails the lookup with ENOTDIR if the current vnode is not a directory.
  * Otherwise the directory becomes the result vnode and, if it is a mount
  * point, cache_fplookup_cross_mount() is called.
  */
 static int __noinline
 cache_fplookup_dot(struct cache_fpl *fpl)
 {
 
 	MPASS(!seqc_in_modify(fpl->dvp_seqc));
 
 	if (__predict_false(fpl->dvp->v_type != VDIR)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENOTDIR));
 	}
 
 	/*
 	 * The result is simply the directory itself. Its seqc is not verified
 	 * here; that happens later, either for the first non-dot component
 	 * which follows and/or before the vnode is returned.
 	 */
 	fpl->tvp = fpl->dvp;
 	fpl->tvp_seqc = fpl->dvp_seqc;
 
 	SDT_PROBE3(vfs, namecache, lookup, hit, fpl->dvp, ".", fpl->dvp);
 
 	if (!cache_fplookup_is_mp(fpl)) {
 		return (0);
 	}
 	return (cache_fplookup_cross_mount(fpl));
 }
 
 /*
  * Resolve the ".." path component.
  *
  * If the current directory is one of the roots checked below (ni_rootdir,
  * ni_topdir, rootvnode or the root of a prison in the credential's prison
  * chain), ".." resolves to the directory itself. A VV_ROOT vnode which is
  * not one of those ends in cache_fpl_partial(). Otherwise the parent is
  * taken from the namecache entry hanging off dvp->v_cache_dd.
  *
  * Returns 0 with fpl->tvp and fpl->tvp_seqc filled in on success.
  */
 static int __noinline
 cache_fplookup_dotdot(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct vnode *dvp;
 	struct prison *pr;
 	u_char nc_flag;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 
 	MPASS(cache_fpl_isdotdot(cnp));
 
 	/*
 	 * XXX this is racy the same way regular lookup is
 	 */
 	/* pr ends up non-NULL iff dvp is the root of some prison in the chain. */
 	for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 	    pr = pr->pr_parent)
 		if (dvp == pr->pr_root)
 			break;
 
 	if (dvp == ndp->ni_rootdir ||
 	    dvp == ndp->ni_topdir ||
 	    dvp == rootvnode ||
 	    pr != NULL) {
 		/* Already at a root: ".." stays where it is. */
 		fpl->tvp = dvp;
 		fpl->tvp_seqc = vn_seqc_read_any(dvp);
 		if (seqc_in_modify(fpl->tvp_seqc)) {
 			return (cache_fpl_aborted(fpl));
 		}
 		return (0);
 	}
 
 	if ((dvp->v_vflag & VV_ROOT) != 0) {
 		/*
 		 * TODO
 		 * The opposite of climb mount is needed here.
 		 */
 		return (cache_fpl_partial(fpl));
 	}
 
 	if (__predict_false(dvp->v_type != VDIR)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENOTDIR));
 	}
 
 	ncp = atomic_load_consume_ptr(&dvp->v_cache_dd);
 	if (ncp == NULL) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * The entry found via v_cache_dd is either a ".." entry, in which case
 	 * the parent is its nc_vp (unless the entry is negative), or some other
 	 * entry, in which case nc_dvp is used as the parent.
 	 */
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
 		if ((nc_flag & NCF_NEGATIVE) != 0)
 			return (cache_fpl_aborted(fpl));
 		fpl->tvp = ncp->nc_vp;
 	} else {
 		fpl->tvp = ncp->nc_dvp;
 	}
 
 	fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp);
 	if (seqc_in_modify(fpl->tvp_seqc)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	/*
 	 * Acquire fence provided by vn_seqc_read_any above.
 	 */
 	/* Make sure v_cache_dd still points at the entry inspected above. */
 	if (__predict_false(atomic_load_ptr(&dvp->v_cache_dd) != ncp)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if (!cache_ncp_canuse(ncp)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	return (0);
 }
 
 /*
  * Handle a hit on a negative namecache entry.
  *
  * Non-LOOKUP operations are passed to cache_fplookup_modifying() with
  * fpl->tvp cleared. For a LOOKUP the hit is registered with the
  * cache_neg_hit_*() routines and, unless the entry has to be promoted or
  * cannot be used, the lookup is completed with ENOENT.
  */
 static int __noinline
 cache_fplookup_neg(struct cache_fpl *fpl, struct namecache *ncp, uint32_t hash)
 {
 	bool promote;
 
 #ifdef INVARIANTS
 	MPASS((atomic_load_char(&ncp->nc_flag) & NCF_NEGATIVE) != 0);
 #endif
 	/*
 	 * If they want to create an entry we need to replace this one.
 	 */
 	if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) {
 		fpl->tvp = NULL;
 		return (cache_fplookup_modifying(fpl));
 	}
 
 	promote = cache_neg_hit_prep(ncp);
 	if (!cache_fpl_neg_ncp_canuse(ncp)) {
 		cache_neg_hit_abort(ncp);
 		return (cache_fpl_partial(fpl));
 	}
 
 	if (!promote) {
 		cache_neg_hit_finish(ncp);
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENOENT));
 	}
 	return (cache_fplookup_negative_promote(fpl, ncp, hash));
 }
 
 /*
  * Resolve a symlink. Called by filesystem-specific routines.
  *
  * Code flow is:
  * ... -> cache_fplookup_symlink -> VOP_FPLOOKUP_SYMLINK -> cache_symlink_resolve
  */
 int
 cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	size_t grown;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 
 	/*
 	 * An empty link target resolves to nothing.
 	 */
 	if (__predict_false(len == 0)) {
 		return (ENOENT);
 	}
 
 	/*
 	 * A very long target combined with a trailing slash is not dealt with
 	 * here; the caller gets EAGAIN.
 	 */
 	if (__predict_false(len > MAXPATHLEN - 2) &&
 	    cache_fpl_istrailingslash(fpl)) {
 		return (EAGAIN);
 	}
 
 	/*
 	 * Length of what follows the current component, terminating nul
 	 * included.
 	 */
 	ndp->ni_pathlen = fpl->nulchar - cnp->cn_nameptr - cnp->cn_namelen + 1;
 #ifdef INVARIANTS
 	if (ndp->ni_pathlen != fpl->debug.ni_pathlen) {
 		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 		    __func__, ndp->ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 	}
 #endif
 
 	if (__predict_false(len + ndp->ni_pathlen > MAXPATHLEN)) {
 		return (ENAMETOOLONG);
 	}
 
 	if (__predict_false(ndp->ni_loopcnt++ >= MAXSYMLINKS)) {
 		return (ELOOP);
 	}
 
 	/*
 	 * Build "<target><remainder>" at the start of the path buffer. The
 	 * remainder is put in place first, then the target is copied in front
 	 * of it.
 	 */
 	if (ndp->ni_pathlen > 1) {
 		bcopy(ndp->ni_next, cnp->cn_pnbuf + len, ndp->ni_pathlen);
 		grown = len;
 	} else if (cache_fpl_istrailingslash(fpl)) {
 		/* Nothing follows, but the trailing slash is carried over. */
 		cnp->cn_pnbuf[len] = '/';
 		cnp->cn_pnbuf[len + 1] = '\0';
 		grown = len + 1;
 	} else {
 		cnp->cn_pnbuf[len] = '\0';
 		grown = len;
 	}
 	bcopy(string, cnp->cn_pnbuf, len);
 
 	/*
 	 * Restart parsing from the beginning of the rewritten buffer.
 	 */
 	ndp->ni_pathlen += grown;
 	cache_fpl_pathlen_add(fpl, grown);
 	cnp->cn_nameptr = cnp->cn_pnbuf;
 	fpl->nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 	fpl->tvp = NULL;
 	return (0);
 }
 
 /*
  * Deal with a symlink found as the current target vnode.
  *
  * A symlink in the last component which is not to be followed is completed
  * with cache_fplookup_final(). Otherwise the link is expanded through
  * VOP_FPLOOKUP_SYMLINK() and, should the new path be absolute, the lookup
  * directory is switched to the one returned by cache_fpl_handle_root().
  */
 static int __noinline
 cache_fplookup_symlink(struct cache_fpl *fpl)
 {
 	struct mount *mp;
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct vnode *dvp, *tvp;
 	struct pwd *pwd;
 	int error;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	tvp = fpl->tvp;
 	pwd = *(fpl->pwd);
 
 	if (cache_fpl_islastcn(ndp) && (cnp->cn_flags & FOLLOW) == 0) {
 		return (cache_fplookup_final(fpl));
 	}
 
 	mp = atomic_load_ptr(&dvp->v_mount);
 	if (__predict_false(mp == NULL)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * Note this check races against setting the flag just like regular
 	 * lookup.
 	 */
 	if (__predict_false((mp->mnt_flag & MNT_NOSYMFOLLOW) != 0)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, EACCES));
 	}
 
 	error = VOP_FPLOOKUP_SYMLINK(tvp, fpl);
 	switch (error) {
 	case 0:
 		break;
 	case EAGAIN:
 		return (cache_fpl_partial(fpl));
 	case ENOENT:
 	case ENAMETOOLONG:
 	case ELOOP:
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, error));
 	default:
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * A relative target continues from the current directory; nothing
 	 * more to do.
 	 */
 	if (*(cnp->cn_nameptr) != '/') {
 		return (0);
 	}
 
 	dvp = cache_fpl_handle_root(fpl);
 	fpl->dvp = dvp;
 	fpl->dvp_seqc = vn_seqc_read_any(dvp);
 	if (seqc_in_modify(fpl->dvp_seqc)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * The main loop assumes that ->dvp points to a vnode belonging
 	 * to a filesystem which can do lockless lookup, but the absolute
 	 * symlink can be wandering off to one which does not.
 	 */
 	mp = atomic_load_ptr(&dvp->v_mount);
 	if (__predict_false(mp == NULL)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	if (!cache_fplookup_mp_supported(mp)) {
 		cache_fpl_checkpoint(fpl);
 		return (cache_fpl_partial(fpl));
 	}
 	if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	return (0);
 }
 
 /*
  * Resolve the current path component against the namecache.
  *
  * "." and ".." are dispatched to their own handlers. Any other name is
  * searched for in the hash chain selected by fpl->hash: a miss goes to
  * cache_fplookup_noentry(), a negative entry to cache_fplookup_neg(). For a
  * usable positive entry fpl->tvp and fpl->tvp_seqc are filled in and, if the
  * vnode is a mount point, cache_fplookup_cross_mount() is called.
  */
 static int
 cache_fplookup_next(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct vnode *dvp, *tvp;
 	u_char nc_flag;
 	uint32_t hash;
 	int error;
 
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 	hash = fpl->hash;
 
 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
 		if (cnp->cn_namelen == 1) {
 			return (cache_fplookup_dot(fpl));
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			return (cache_fplookup_dotdot(fpl));
 		}
 	}
 
 	MPASS(!cache_fpl_isdotdot(cnp));
 
 	/*
 	 * An entry matches when it has the same parent directory, the same
 	 * length and the same name bytes. ncp is NULL if the chain is exhausted
 	 * without a match.
 	 */
 	CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	if (__predict_false(ncp == NULL)) {
 		return (cache_fplookup_noentry(fpl));
 	}
 
 	/* The target vnode is loaded before the flags are inspected. */
 	tvp = atomic_load_ptr(&ncp->nc_vp);
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_NEGATIVE) != 0) {
 		return (cache_fplookup_neg(fpl, ncp, hash));
 	}
 
 	if (!cache_ncp_canuse(ncp)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	fpl->tvp = tvp;
 	fpl->tvp_seqc = vn_seqc_read_any(tvp);
 	if (seqc_in_modify(fpl->tvp_seqc)) {
 		return (cache_fpl_partial(fpl));
 	}
 
 	counter_u64_add(numposhits, 1);
 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
 
 	error = 0;
 	if (cache_fplookup_is_mp(fpl)) {
 		error = cache_fplookup_cross_mount(fpl);
 	}
 	return (error);
 }
 
 /*
  * Report whether the mount has MNTK_FPLOOKUP set in mnt_kern_flag. The mount
  * pointer must not be NULL.
  */
 static bool
 cache_fplookup_mp_supported(struct mount *mp)
 {
 
 	MPASS(mp != NULL);
 	return ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0);
 }
 
 /*
  * Walk up the mount stack (if any).
  *
  * Correctness is provided in the following ways:
  * - all vnodes are protected from freeing with SMR
  * - struct mount objects are type stable making them always safe to access
  * - stability of the particular mount is provided by busying it
  * - relationship between the vnode which is mounted on and the mount is
  *   verified with the vnode sequence counter after busying
  * - association between root vnode of the mount and the mount is protected
  *   by busy
  *
  * From that point on we can read the sequence counter of the root vnode
  * and get the next mount on the stack (if any) using the same protection.
  *
  * By the end of successful walk we are guaranteed the reached state was
  * indeed present at least at some point which matches the regular lookup.
  */
 static int __noinline
 cache_fplookup_climb_mount(struct cache_fpl *fpl)
 {
 	struct mount *mp, *prev_mp;
 	struct mount_pcpu *mpcpu, *prev_mpcpu;
 	struct vnode *vp;
 	seqc_t vp_seqc;
 
 	vp = fpl->tvp;
 	vp_seqc = fpl->tvp_seqc;
 
 	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
 	mp = atomic_load_ptr(&vp->v_mountedhere);
 	if (__predict_false(mp == NULL)) {
 		/* Nothing is mounted here; fpl->tvp stays as it is. */
 		return (0);
 	}
 
 	prev_mp = NULL;
 	for (;;) {
 		/*
 		 * Enter the critical section of the next mount before leaving
 		 * the one of the previous mount, so that at least one of them
 		 * is held at any time during the walk.
 		 */
 		if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 			if (prev_mp != NULL)
 				vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 			return (cache_fpl_partial(fpl));
 		}
 		if (prev_mp != NULL)
 			vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 		/* The covered vnode must not have changed in the meantime. */
 		if (!vn_seqc_consistent(vp, vp_seqc)) {
 			vfs_op_thread_exit_crit(mp, mpcpu);
 			return (cache_fpl_partial(fpl));
 		}
 		if (!cache_fplookup_mp_supported(mp)) {
 			vfs_op_thread_exit_crit(mp, mpcpu);
 			return (cache_fpl_partial(fpl));
 		}
 		/* Move on to the root vnode of this mount. */
 		vp = atomic_load_ptr(&mp->mnt_rootvnode);
 		if (vp == NULL) {
 			vfs_op_thread_exit_crit(mp, mpcpu);
 			return (cache_fpl_partial(fpl));
 		}
 		vp_seqc = vn_seqc_read_any(vp);
 		if (seqc_in_modify(vp_seqc)) {
 			vfs_op_thread_exit_crit(mp, mpcpu);
 			return (cache_fpl_partial(fpl));
 		}
 		prev_mp = mp;
 		prev_mpcpu = mpcpu;
 		/* Keep going for as long as something is mounted on top. */
 		mp = atomic_load_ptr(&vp->v_mountedhere);
 		if (mp == NULL)
 			break;
 	}
 
 	vfs_op_thread_exit_crit(prev_mp, prev_mpcpu);
 	/* Publish the topmost root vnode as the lookup result. */
 	fpl->tvp = vp;
 	fpl->tvp_seqc = vp_seqc;
 	return (0);
 }
 
 /*
  * Cross a single mount point: replace fpl->tvp with the root vnode of the
  * filesystem mounted on it.
  *
  * This is the one-level variant of cache_fplookup_climb_mount(). If the root
  * vnode turns out to be mounted on as well, the work is redone by
  * cache_fplookup_climb_mount(), starting from the unchanged fpl->tvp.
  *
  * Returns 0 on success (including when nothing is mounted on fpl->tvp) and
  * the result of cache_fpl_partial() when a check fails.
  */
 static int __noinline
 cache_fplookup_cross_mount(struct cache_fpl *fpl)
 {
 	struct mount *mp;
 	struct mount_pcpu *mpcpu;
 	struct vnode *vp;
 	seqc_t vp_seqc;
 
 	vp = fpl->tvp;
 	vp_seqc = fpl->tvp_seqc;
 
 	VNPASS(vp->v_type == VDIR || vp->v_type == VREG || vp->v_type == VBAD, vp);
 	mp = atomic_load_ptr(&vp->v_mountedhere);
 	if (__predict_false(mp == NULL)) {
 		return (0);
 	}
 
 	if (!vfs_op_thread_enter_crit(mp, mpcpu)) {
 		return (cache_fpl_partial(fpl));
 	}
 	/* The covered vnode must not have changed since its seqc was read. */
 	if (!vn_seqc_consistent(vp, vp_seqc)) {
 		vfs_op_thread_exit_crit(mp, mpcpu);
 		return (cache_fpl_partial(fpl));
 	}
 	if (!cache_fplookup_mp_supported(mp)) {
 		vfs_op_thread_exit_crit(mp, mpcpu);
 		return (cache_fpl_partial(fpl));
 	}
 	vp = atomic_load_ptr(&mp->mnt_rootvnode);
 	if (__predict_false(vp == NULL)) {
 		vfs_op_thread_exit_crit(mp, mpcpu);
 		return (cache_fpl_partial(fpl));
 	}
 	/* The root vnode's seqc is sampled before the section is left. */
 	vp_seqc = vn_seqc_read_any(vp);
 	vfs_op_thread_exit_crit(mp, mpcpu);
 	if (seqc_in_modify(vp_seqc)) {
 		return (cache_fpl_partial(fpl));
 	}
 	mp = atomic_load_ptr(&vp->v_mountedhere);
 	if (__predict_false(mp != NULL)) {
 		/*
 		 * There are possibly more mount points on top.
 		 * Normally this does not happen so for simplicity just start
 		 * over.
 		 */
 		return (cache_fplookup_climb_mount(fpl));
 	}
 
 	fpl->tvp = vp;
 	fpl->tvp_seqc = vp_seqc;
 	return (0);
 }
 
 /*
  * Tell whether the current target vnode (fpl->tvp) is mounted on, judged by
  * the VIRF_MOUNTPOINT bit of its irflag.
  */
 static bool
 cache_fplookup_is_mp(struct cache_fpl *fpl)
 {
 
 	return ((vn_irflag_read(fpl->tvp) & VIRF_MOUNTPOINT) != 0);
 }
 
 /*
  * Parse the path.
  *
  * The code was originally copy-pasted from regular lookup and despite
  * clean ups leaves performance on the table. Any modifications here
  * must take into account that in case of fallback the resulting
  * nameidata state has to be compatible with the original.
  */
 
 /*
  * Debug ni_pathlen tracking.
  */
 #ifdef INVARIANTS
 /*
  * Grow the debug copy of ni_pathlen by n and check it stays within PATH_MAX.
  */
 static void
 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 {
 
 	fpl->debug.ni_pathlen += n;
 	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 	    ("%s: pathlen overflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 }
 
 /*
  * Shrink the debug copy of ni_pathlen by n and check it stays within PATH_MAX.
  */
 static void
 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 {
 
 	fpl->debug.ni_pathlen -= n;
 	KASSERT(fpl->debug.ni_pathlen <= PATH_MAX,
 	    ("%s: pathlen underflow to %zd\n", __func__, fpl->debug.ni_pathlen));
 }
 #else
 /*
  * Without INVARIANTS nothing is tracked; both routines are empty.
  */
 static void
 cache_fpl_pathlen_add(struct cache_fpl *fpl, size_t n)
 {
 }
 
 static void
 cache_fpl_pathlen_sub(struct cache_fpl *fpl, size_t n)
 {
 }
 #endif
 
 /*
  * Single-character shorthands for the two routines above.
  */
 static void
 cache_fpl_pathlen_inc(struct cache_fpl *fpl)
 {
 
 	cache_fpl_pathlen_add(fpl, 1);
 }
 
 static void
 cache_fpl_pathlen_dec(struct cache_fpl *fpl)
 {
 
 	cache_fpl_pathlen_sub(fpl, 1);
 }
 
 /*
  * Parse the next path component starting at cnp->cn_nameptr.
  *
  * On return cnp->cn_namelen holds the length of the component, fpl->hash the
  * namecache hash of its name computed for fpl->dvp, and ndp->ni_next points
  * at the character which terminated it (a slash or the final nul).
  */
 static void
 cache_fplookup_parse(struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct vnode *dvp;
 	char *cp;
 	uint32_t hash;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	dvp = fpl->dvp;
 
 	/*
 	 * Find the end of this path component, it is either / or nul.
 	 *
 	 * Store / as a temporary sentinel so that we only have one character
 	 * to test for. Pathnames tend to be short so this should not be
 	 * resulting in cache misses.
 	 *
 	 * TODO: fix this to be word-sized.
 	 */
 	MPASS(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] >= cnp->cn_pnbuf);
 	KASSERT(&cnp->cn_nameptr[fpl->debug.ni_pathlen - 1] == fpl->nulchar,
 	    ("%s: mismatch between pathlen (%zu) and nulchar (%p != %p), string [%s]\n",
 	    __func__, fpl->debug.ni_pathlen, &cnp->cn_nameptr[fpl->debug.ni_pathlen - 1],
 	    fpl->nulchar, cnp->cn_pnbuf));
 	KASSERT(*fpl->nulchar == '\0',
 	    ("%s: expected nul at %p; string [%s]\n", __func__, fpl->nulchar,
 	    cnp->cn_pnbuf));
 	hash = cache_get_hash_iter_start(dvp);
 	/* Plant the sentinel; it is removed again right after the scan. */
 	*fpl->nulchar = '/';
 	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
 		KASSERT(*cp != '\0',
 		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
 		    cnp->cn_nameptr));
 		/* The hash is accumulated one character at a time. */
 		hash = cache_get_hash_iter(*cp, hash);
 		continue;
 	}
 	*fpl->nulchar = '\0';
 	fpl->hash = cache_get_hash_iter_finish(hash);
 
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	cache_fpl_pathlen_sub(fpl, cnp->cn_namelen);
 
 #ifdef INVARIANTS
 	/*
 	 * cache_get_hash only accepts lengths up to NAME_MAX. This is fine since
 	 * we are going to fail this lookup with ENAMETOOLONG (see below).
 	 */
 	if (cnp->cn_namelen <= NAME_MAX) {
 		if (fpl->hash != cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp)) {
 			panic("%s: mismatched hash for [%s] len %ld", __func__,
 			    cnp->cn_nameptr, cnp->cn_namelen);
 		}
 	}
 #endif
 
 	/*
 	 * Hack: we have to check if the found path component's length exceeds
 	 * NAME_MAX. However, the condition is very rarely true and check can
 	 * be elided in the common case -- if an entry was found in the cache,
 	 * then it could not have been too long to begin with.
 	 */
 	ndp->ni_next = cp;
 }
 
 /*
  * Move on to the next path component: position cn_nameptr just past the
  * slash ndp->ni_next points at and account for the skipped character in the
  * debug path length.
  */
 static void
 cache_fplookup_parse_advance(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 	char *slash;
 
 	cnp = fpl->cnp;
 	slash = fpl->ndp->ni_next;
 
 	cnp->cn_nameptr = slash;
 	KASSERT(*slash == '/',
 	    ("%s: should have seen slash at %p ; buf %p [%s]\n", __func__,
 	    slash, cnp->cn_pnbuf, cnp->cn_pnbuf));
 	cnp->cn_nameptr = slash + 1;
 	cache_fpl_pathlen_dec(fpl);
 }
 
 /*
  * Skip spurious slashes in a pathname (e.g., "foo///bar") and retry.
  *
  * Lockless lookup tries to elide checking for spurious slashes and should they
  * be present is guaranteed to fail to find an entry. In this case the caller
  * must check if the name starts with a slash and call this routine.  It is
  * going to fast forward across the spurious slashes and set the state up for
  * retry.
  */
 static int __noinline
 cache_fplookup_skip_slashes(struct cache_fpl *fpl)
 {
 	struct componentname *cnp;
 
 	cnp = fpl->cnp;
 
 	MPASS(*(cnp->cn_nameptr) == '/');
 
 	/*
 	 * Consume the whole run of slashes, keeping the debug path length in
 	 * step with the name pointer one character at a time.
 	 */
 	for (;;) {
 		cnp->cn_nameptr++;
 		cache_fpl_pathlen_dec(fpl);
 		if (*(cnp->cn_nameptr) != '/') {
 			break;
 		}
 	}
 
 	/*
 	 * Step back onto the last slash: cache_fplookup_parse_advance starts
 	 * from ndp->ni_next and needs a slash there to skip.
 	 */
 	cnp->cn_nameptr--;
 	cache_fpl_pathlen_inc(fpl);
 	fpl->ndp->ni_next = cnp->cn_nameptr;
 
 	/*
 	 * No component was resolved, so the directory itself becomes the
 	 * result (see cache_fplookup_dot).
 	 */
 	fpl->tvp = fpl->dvp;
 	fpl->tvp_seqc = fpl->dvp_seqc;
 
 	return (0);
 }
 
 /*
  * Handle trailing slashes (e.g., "foo/").
  *
  * If a trailing slash is found the terminal vnode must be a directory.
  * Regular lookup shortens the path by nullifying the first trailing slash and
  * sets the TRAILINGSLASH flag to denote this took place. There are several
  * checks on it performed later.
  *
  * Similarly to spurious slashes, lockless lookup handles this in a speculative
  * manner relying on an invariant that a non-directory vnode will get a miss.
  * In this case cn_nameptr[0] == '\0' and cn_namelen == 0.
  *
  * Thus for a path like "foo/bar/" the code unwinds the state back to "bar/"
  * and denotes this is the last path component, which avoids looping back.
  *
  * Only plain lookups are supported for now to restrict corner cases to handle.
  */
 static int __noinline
 cache_fplookup_trailingslash(struct cache_fpl *fpl)
 {
 #ifdef INVARIANTS
 	size_t ni_pathlen;
 #endif
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct namecache *ncp;
 	struct vnode *tvp;
 	char *cn_nameptr_orig, *cn_nameptr_slash;
 	seqc_t tvp_seqc;
 	u_char nc_flag;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 	tvp = fpl->tvp;
 	tvp_seqc = fpl->tvp_seqc;
 
 	MPASS(fpl->dvp == fpl->tvp);
 	KASSERT(cache_fpl_istrailingslash(fpl),
 	    ("%s: expected trailing slash at %p; string [%s]\n", __func__, fpl->nulchar - 1,
 	    cnp->cn_pnbuf));
 	KASSERT(cnp->cn_nameptr[0] == '\0',
 	    ("%s: expected nul char at %p; string [%s]\n", __func__, &cnp->cn_nameptr[0],
 	    cnp->cn_pnbuf));
 	KASSERT(cnp->cn_namelen == 0,
 	    ("%s: namelen 0 but got %ld; string [%s]\n", __func__, cnp->cn_namelen,
 	    cnp->cn_pnbuf));
 	MPASS(cnp->cn_nameptr > cnp->cn_pnbuf);
 
 	/*
 	 * Only plain lookups are supported.
 	 */
 	if (cnp->cn_nameiop != LOOKUP) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * A trailing slash requires a directory. ENOTDIR is only reported if
 	 * the vnode did not change since its seqc was sampled.
 	 */
 	if (__predict_false(tvp->v_type != VDIR)) {
 		if (!vn_seqc_consistent(tvp, tvp_seqc)) {
 			return (cache_fpl_aborted(fpl));
 		}
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENOTDIR));
 	}
 
 	/*
 	 * Denote the last component.
 	 */
 	ndp->ni_next = &cnp->cn_nameptr[0];
 	MPASS(cache_fpl_islastcn(ndp));
 
 	/*
 	 * Unwind trailing slashes.
 	 */
 	/* On exit cn_nameptr points at the last character of the name. */
 	cn_nameptr_orig = cnp->cn_nameptr;
 	while (cnp->cn_nameptr >= cnp->cn_pnbuf) {
 		cnp->cn_nameptr--;
 		if (cnp->cn_nameptr[0] != '/') {
 			break;
 		}
 	}
 
 	/*
 	 * Unwind to the beginning of the path component.
 	 *
 	 * Note the path may or may not have started with a slash.
 	 */
 	cn_nameptr_slash = cnp->cn_nameptr;
 	while (cnp->cn_nameptr > cnp->cn_pnbuf) {
 		cnp->cn_nameptr--;
 		if (cnp->cn_nameptr[0] == '/') {
 			break;
 		}
 	}
 	/* Stopped on the separating slash: the name starts right after it. */
 	if (cnp->cn_nameptr[0] == '/') {
 		cnp->cn_nameptr++;
 	}
 
 	/*
 	 * cn_nameptr_slash is the last character of the name, hence the + 1.
 	 * The debug path length grows by the distance the pointer moved back.
 	 */
 	cnp->cn_namelen = cn_nameptr_slash - cnp->cn_nameptr + 1;
 	cache_fpl_pathlen_add(fpl, cn_nameptr_orig - cnp->cn_nameptr);
 	cache_fpl_checkpoint(fpl);
 
 #ifdef INVARIANTS
 	ni_pathlen = fpl->nulchar - cnp->cn_nameptr + 1;
 	if (ni_pathlen != fpl->debug.ni_pathlen) {
 		panic("%s: mismatch (%zu != %zu) nulchar %p nameptr %p [%s] ; full string [%s]\n",
 		    __func__, ni_pathlen, fpl->debug.ni_pathlen, fpl->nulchar,
 		    cnp->cn_nameptr, cnp->cn_nameptr, cnp->cn_pnbuf);
 	}
 #endif
 
 	/*
 	 * If this was a "./" lookup the parent directory is already correct.
 	 */
 	if (cnp->cn_nameptr[0] == '.' && cnp->cn_namelen == 1) {
 		return (0);
 	}
 
 	/*
 	 * Otherwise we need to look it up.
 	 */
 	/*
 	 * The parent is taken from the namecache entry hanging off
 	 * tvp->v_cache_dd; a missing entry or a ".." entry aborts the lookup.
 	 */
 	tvp = fpl->tvp;
 	ncp = atomic_load_consume_ptr(&tvp->v_cache_dd);
 	if (__predict_false(ncp == NULL)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	nc_flag = atomic_load_char(&ncp->nc_flag);
 	if ((nc_flag & NCF_ISDOTDOT) != 0) {
 		return (cache_fpl_aborted(fpl));
 	}
 	fpl->dvp = ncp->nc_dvp;
 	fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp);
 	if (seqc_in_modify(fpl->dvp_seqc)) {
 		return (cache_fpl_aborted(fpl));
 	}
 	return (0);
 }
 
 /*
  * See the API contract for VOP_FPLOOKUP_VEXEC.
  */
 static int __noinline
 cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error)
 {
 	struct componentname *cnp;
 	struct vnode *dir;
 	seqc_t dir_seqc;
 
 	cnp = fpl->cnp;
 	dir = fpl->dvp;
 	dir_seqc = fpl->dvp_seqc;
 
 	/*
 	 * Hack: delayed empty path checking.
 	 */
 	if (cnp->cn_pnbuf[0] == '\0') {
 		return (cache_fplookup_emptypath(fpl));
 	}
 
 	/*
 	 * TODO: Because trailing slashes are ignored, the lookup performs a
 	 * permission check on the last directory although it should not. The
 	 * check may fail, but that failure ought to be ignored. This could be
 	 * fixed up fully without resorting to regular lookup; for now just
 	 * abort.
 	 */
 	if (cache_fpl_istrailingslash(fpl)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * Hack: delayed degenerate path checking.
 	 */
 	if (cnp->cn_nameptr[0] == '\0' && fpl->tvp == NULL) {
 		return (cache_fplookup_degenerate(fpl));
 	}
 
 	/*
 	 * Hack: delayed name len checking.
 	 */
 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
 		cache_fpl_smr_exit(fpl);
 		return (cache_fpl_handled_error(fpl, ENAMETOOLONG));
 	}
 
 	/*
 	 * Hack: the lookup may be for foo/bar where foo is not a directory.
 	 * ENOTDIR has to be returned in that case, whatever error brought us
 	 * here.
 	 */
 	if (dir->v_type != VDIR) {
 		error = ENOTDIR;
 	}
 
 	/*
 	 * Hack: handle O_SEARCH.
 	 *
 	 * Open Group Base Specifications Issue 7, 2018 edition states:
 	 * <quote>
 	 * If the access mode of the open file description associated with the
 	 * file descriptor is not O_SEARCH, the function shall check whether
 	 * directory searches are permitted using the current permissions of
 	 * the directory underlying the file descriptor. If the access mode is
 	 * O_SEARCH, the function shall not perform the check.
 	 * </quote>
 	 *
 	 * Regular lookup tests the NOEXECCHECK flag for every path component
 	 * to decide whether to do the permission check. Since most lookups
 	 * never carry the flag (and those which do only have it for the first
 	 * component), lockless lookup acts on it only when a permission
 	 * problem shows up. The flag is kept as a boolean here so that it does
 	 * not have to be cleared on the way out.
 	 *
 	 * For simplicity this always aborts.
 	 * TODO: check if this is the first lookup and ignore the permission
 	 * problem. Note the flag has to survive fallback (if it happens to be
 	 * performed).
 	 */
 	if (fpl->fsearch) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	/*
 	 * Whatever the error, it can only be trusted if the directory did not
 	 * change since its sequence counter was sampled.
 	 */
 	if (!vn_seqc_consistent(dir, dir_seqc)) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	if (error == EAGAIN) {
 		cache_fpl_partial(fpl);
 	} else {
 		cache_fpl_smr_exit(fpl);
 		cache_fpl_handled_error(fpl, error);
 	}
 	return (error);
 }
 
 /*
  * Core loop of the lockless lookup.
  *
  * Starting from dvp, handles one path component per iteration: parse it,
  * check search permission on the current directory, resolve the entry and
  * then either hand a symlink to cache_fplookup_symlink(), finish on the
  * last component, or descend into the resolved vnode.
  *
  * Returns the value produced by whichever helper ended the walk; the caller
  * additionally inspects fpl for the resulting lookup status.
  */
 static int
 cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
 {
 	struct nameidata *ndp;
 	struct componentname *cnp;
 	struct mount *mp;
 	int error;
 
 	ndp = fpl->ndp;
 	cnp = fpl->cnp;
 
 	cache_fpl_checkpoint(fpl);
 
 	/*
 	 * The vnode at hand is almost always stable, skip checking for it.
 	 * Worst case this postpones the check towards the end of the iteration
 	 * of the main loop.
 	 */
 	fpl->dvp = dvp;
 	fpl->dvp_seqc = vn_seqc_read_notmodify(fpl->dvp);
 
 	/*
 	 * Give up on the fast path when the starting vnode has no mount point
 	 * or cache_fplookup_mp_supported() rejects the mount.
 	 */
 	mp = atomic_load_ptr(&dvp->v_mount);
 	if (__predict_false(mp == NULL || !cache_fplookup_mp_supported(mp))) {
 		return (cache_fpl_aborted(fpl));
 	}
 
 	MPASS(fpl->tvp == NULL);
 
 	for (;;) {
 		cache_fplookup_parse(fpl);
 
 		/*
 		 * Search permission check on the current directory.  Any
 		 * non-zero result is handed to cache_fplookup_failed_vexec(),
 		 * whose return value ends the walk.
 		 */
 		error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred);
 		if (__predict_false(error != 0)) {
 			error = cache_fplookup_failed_vexec(fpl, error);
 			break;
 		}
 
 		/*
 		 * Resolve the component.  fpl->tvp and fpl->tvp_seqc are
 		 * consumed below, so cache_fplookup_next() is expected to
 		 * have set them whenever the lookup was not terminated.
 		 */
 		error = cache_fplookup_next(fpl);
 		if (__predict_false(cache_fpl_terminated(fpl))) {
 			break;
 		}
 
 		VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
 
 		if (fpl->tvp->v_type == VLNK) {
 			/* Symlink: keep looping unless the helper ended the lookup. */
 			error = cache_fplookup_symlink(fpl);
 			if (cache_fpl_terminated(fpl)) {
 				break;
 			}
 		} else {
 			/* Last component: the final helper decides the outcome. */
 			if (cache_fpl_islastcn(ndp)) {
 				error = cache_fplookup_final(fpl);
 				break;
 			}
 
 			/*
 			 * Intermediate component: the parent must not have
 			 * changed while it was being crossed.
 			 */
 			if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) {
 				error = cache_fpl_aborted(fpl);
 				break;
 			}
 
 			/* Descend: the child is the directory of the next iteration. */
 			fpl->dvp = fpl->tvp;
 			fpl->dvp_seqc = fpl->tvp_seqc;
 			cache_fplookup_parse_advance(fpl);
 		}
 
 		cache_fpl_checkpoint(fpl);
 	}
 
 	return (error);
 }
 
 /*
  * Fast path lookup protected with SMR and sequence counters.
  *
  * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
  *
  * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
  * outlined below.
  *
  * Traditional vnode lookup conceptually looks like this:
  *
  * vn_lock(current);
  * for (;;) {
  *	next = find();
  *	vn_lock(next);
  *	vn_unlock(current);
  *	current = next;
  *	if (last)
  *	    break;
  * }
  * return (current);
  *
  * Each jump to the next vnode is safe memory-wise and atomic with respect to
  * any modifications thanks to holding respective locks.
  *
  * The same guarantee can be provided with a combination of safe memory
  * reclamation and sequence counters instead. If all operations which affect
  * the relationship between the current vnode and the one we are looking for
  * also modify the counter, we can verify whether all the conditions held as
  * we made the jump. This includes things like permissions, mount points etc.
  * Counter modification is provided by enclosing relevant places in
  * vn_seqc_write_begin()/end() calls.
  *
  * Thus this translates to:
  *
  * vfs_smr_enter();
  * dvp_seqc = seqc_read_any(dvp);
  * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode
  *     abort();
  * for (;;) {
  * 	tvp = find();
  * 	tvp_seqc = seqc_read_any(tvp);
  * 	if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode
  * 	    abort();
  * 	if (!seqc_consistent(dvp, dvp_seqc)) // someone is altering the vnode
  * 	    abort();
  * 	dvp = tvp; // we know nothing of importance has changed
  * 	dvp_seqc = tvp_seqc; // store the counter for the tvp iteration
  * 	if (last)
  * 	    break;
  * }
  * vget(); // secure the vnode
  * if (!seqc_consistent(tvp, tvp_seqc)) // final check
  * 	    abort();
  * // at this point we know nothing has changed for any parent<->child pair
  * // as they were crossed during the lookup, meaning we matched the guarantee
  * // of the locked variant
  * return (tvp);
  *
  * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
  * - they are called while within vfs_smr protection which they must never exit
  * - EAGAIN can be returned to denote checking could not be performed, it is
  *   always valid to return it
  * - if the sequence counter has not changed the result must be valid
  * - if the sequence counter has changed both false positives and false negatives
  *   are permitted (since the result will be rejected later)
  * - for simple cases of unix permission checks vaccess_vexec_smr can be used
  *
  * Caveats to watch out for:
  * - vnodes are passed unlocked and unreferenced with nothing stopping
  *   VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
  *   to use atomic_load_ptr to fetch it.
  * - the aforementioned object can also get freed, meaning absent other means it
  *   should be protected with vfs_smr
  * - either safely checking permissions as they are modified or guaranteeing
  *   their stability is left to the routine
  */
 int
 cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status,
     struct pwd **pwdp)
 {
 	struct cache_fpl fpl;
 	struct pwd *pwd;
 	struct vnode *dvp;
 	struct componentname *cnp;
 	int error;
 
 	/*
 	 * Set up the on-stack lookup state and sanity-check what the caller
 	 * handed in: no leftover ni_lcf/ni_resflags, no internal cn_flags and
 	 * a name pointer still at the start of the path buffer.
 	 */
 	fpl.status = CACHE_FPL_STATUS_UNSET;
 	fpl.in_smr = false;
 	fpl.ndp = ndp;
 	fpl.cnp = cnp = &ndp->ni_cnd;
 	MPASS(ndp->ni_lcf == 0);
 	KASSERT ((cnp->cn_flags & CACHE_FPL_INTERNAL_CN_FLAGS) == 0,
 	    ("%s: internal flags found in cn_flags %" PRIx64, __func__,
 	    cnp->cn_flags));
 	MPASS(cnp->cn_nameptr == cnp->cn_pnbuf);
 	MPASS(ndp->ni_resflags == 0);
 
 	/*
 	 * Fast path refused for this request: report whatever status
 	 * cache_can_fplookup() left in fpl and return EOPNOTSUPP.
 	 */
 	if (__predict_false(!cache_can_fplookup(&fpl))) {
 		*status = fpl.status;
 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 		return (EOPNOTSUPP);
 	}
 
 	cache_fpl_checkpoint_outer(&fpl);
 
 	cache_fpl_smr_enter_initial(&fpl);
 #ifdef INVARIANTS
 	fpl.debug.ni_pathlen = ndp->ni_pathlen;
 #endif
 	/*
 	 * Byte at index ni_pathlen - 1 from the start of the name; presumably
 	 * the terminating NUL of the path -- confirm against ni_pathlen.
 	 */
 	fpl.nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
 	fpl.fsearch = false;
 	fpl.tvp = NULL; /* for degenerate path handling */
 	/* The pwd obtained here is also published to the caller through pwdp. */
 	fpl.pwd = pwdp;
 	pwd = pwd_get_smr();
 	*(fpl.pwd) = pwd;
 	namei_setup_rootdir(ndp, cnp, pwd);
 	ndp->ni_topdir = pwd->pwd_jdir;
 
 	/*
 	 * Pick the starting directory: the one returned by
 	 * cache_fpl_handle_root() for a path beginning with '/', the current
 	 * directory for AT_FDCWD, otherwise whatever cache_fplookup_dirfd()
 	 * resolves (a failure there skips the walk entirely).
 	 */
 	if (cnp->cn_pnbuf[0] == '/') {
 		dvp = cache_fpl_handle_root(&fpl);
 		ndp->ni_resflags = NIRES_ABS;
 	} else {
 		if (ndp->ni_dirfd == AT_FDCWD) {
 			dvp = pwd->pwd_cdir;
 		} else {
 			error = cache_fplookup_dirfd(&fpl, &dvp);
 			if (__predict_false(error != 0)) {
 				goto out;
 			}
 		}
 	}
 
 	SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true);
 	error = cache_fplookup_impl(dvp, &fpl);
 out:
 	/*
 	 * Common exit.  The two assertions check the state left behind by the
 	 * helpers before the status is reported to the caller.
 	 */
 	cache_fpl_smr_assert_not_entered(&fpl);
 	cache_fpl_assert_status(&fpl);
 	*status = fpl.status;
 	if (SDT_PROBES_ENABLED()) {
 		SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status);
 		if (fpl.status == CACHE_FPL_STATUS_HANDLED)
 			SDT_PROBE4(vfs, namei, lookup, return, error, ndp->ni_vp, true,
 			    ndp);
 	}
 
 	/*
 	 * Results are copied into ndp only for a handled lookup.  On a handled
 	 * error the component name is cleaned up and both vnode pointers are
 	 * asserted to be NULL, so ndp receives NULLs in that case.
 	 */
 	if (__predict_true(fpl.status == CACHE_FPL_STATUS_HANDLED)) {
 		MPASS(error != CACHE_FPL_FAILED);
 		if (error != 0) {
 			cache_fpl_cleanup_cnp(fpl.cnp);
 			MPASS(fpl.dvp == NULL);
 			MPASS(fpl.tvp == NULL);
 		}
 		ndp->ni_dvp = fpl.dvp;
 		ndp->ni_vp = fpl.tvp;
 	}
 	return (error);
 }
diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
index 5fce4b8e1713..f08080d4e4fa 100644
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -1,1221 +1,1242 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Mike Karels at Berkeley Software Design, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sysctl.h	8.1 (Berkeley) 6/2/93
  */
 
 #ifndef _SYS_SYSCTL_H_
 #define	_SYS_SYSCTL_H_
 
 #ifdef _KERNEL
+#include <sys/cdefs.h>
 #include <sys/queue.h>
 #include <sys/tree.h>
 #endif
 
 /*
  * Definitions for sysctl call.  The sysctl call uses a hierarchical name
  * for objects that can be examined or modified.  The name is expressed as
  * a sequence of integers.  Like a file path name, the meaning of each
  * component depends on its place in the hierarchy.  The top-level and kern
  * identifiers are defined here, and other identifiers are defined in the
  * respective subsystem header files.
  *
  * Each subsystem defined by sysctl defines a list of variables for that
  * subsystem. Each name is either a node with further levels defined below it,
  * or it is a leaf of some particular type given below. Each sysctl level
  * defines a set of name/type pairs to be used by sysctl(8) in manipulating the
  * subsystem.
  */
 
 #define	CTL_MAXNAME	24	/* largest number of components supported */
 
 #define	CTLTYPE		0xf	/* mask for the type */
 #define	CTLTYPE_NODE	1	/* name is a node */
 #define	CTLTYPE_INT	2	/* name describes an integer */
 #define	CTLTYPE_STRING	3	/* name describes a string */
 #define	CTLTYPE_S64	4	/* name describes a signed 64-bit number */
 #define	CTLTYPE_OPAQUE	5	/* name describes a structure */
 #define	CTLTYPE_STRUCT	CTLTYPE_OPAQUE	/* name describes a structure */
 #define	CTLTYPE_UINT	6	/* name describes an unsigned integer */
 #define	CTLTYPE_LONG	7	/* name describes a long */
 #define	CTLTYPE_ULONG	8	/* name describes an unsigned long */
 #define	CTLTYPE_U64	9	/* name describes an unsigned 64-bit number */
 #define	CTLTYPE_U8	0xa	/* name describes an unsigned 8-bit number */
 #define	CTLTYPE_U16	0xb	/* name describes an unsigned 16-bit number */
 #define	CTLTYPE_S8	0xc	/* name describes a signed 8-bit number */
 #define	CTLTYPE_S16	0xd	/* name describes a signed 16-bit number */
 #define	CTLTYPE_S32	0xe	/* name describes a signed 32-bit number */
 #define	CTLTYPE_U32	0xf	/* name describes an unsigned 32-bit number */
 
 #define	CTLFLAG_RD	0x80000000	/* Allow reads of variable */
 #define	CTLFLAG_WR	0x40000000	/* Allow writes to the variable */
 #define	CTLFLAG_RW	(CTLFLAG_RD|CTLFLAG_WR)
 #define	CTLFLAG_DORMANT	0x20000000	/* This sysctl is not active yet */
 #define	CTLFLAG_ANYBODY	0x10000000	/* All users can set this var */
 #define	CTLFLAG_SECURE	0x08000000	/* Permit set only if securelevel<=0 */
 #define	CTLFLAG_PRISON	0x04000000	/* Prisoned roots can fiddle */
 #define	CTLFLAG_DYN	0x02000000	/* Dynamic oid - can be freed */
 #define	CTLFLAG_SKIP	0x01000000	/* Skip this sysctl when listing */
 #define	CTLMASK_SECURE	0x00F00000	/* Secure level */
 #define	CTLFLAG_TUN	0x00080000	/* Default value is loaded from getenv() */
 #define	CTLFLAG_RDTUN	(CTLFLAG_RD|CTLFLAG_TUN)
 #define	CTLFLAG_RWTUN	(CTLFLAG_RW|CTLFLAG_TUN)
 #define	CTLFLAG_MPSAFE	0x00040000	/* Handler is MP safe */
 #define	CTLFLAG_VNET	0x00020000	/* Prisons with vnet can fiddle */
 #define	CTLFLAG_DYING	0x00010000	/* Oid is being removed */
 #define	CTLFLAG_CAPRD	0x00008000	/* Can be read in capability mode */
 #define	CTLFLAG_CAPWR	0x00004000	/* Can be written in capability mode */
 #define	CTLFLAG_STATS	0x00002000	/* Statistics, not a tuneable */
 #define	CTLFLAG_NOFETCH	0x00001000	/* Don't fetch tunable from getenv() */
 #define	CTLFLAG_CAPRW	(CTLFLAG_CAPRD|CTLFLAG_CAPWR)
 /*
  * This is a transient flag to be used until all sysctl handlers are
  * converted to not lock Giant.
  * Exactly one of CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT is required
  * for SYSCTL_PROC and SYSCTL_NODE.
  */
 #define	CTLFLAG_NEEDGIANT 0x00000800	/* Handler require Giant */
 
 /*
  * Secure level.   Note that CTLFLAG_SECURE == CTLFLAG_SECURE1.
  *
  * Secure when the securelevel is raised to at least N.
  */
 #define	CTLSHIFT_SECURE	20
 #define	CTLFLAG_SECURE1	(CTLFLAG_SECURE | (0 << CTLSHIFT_SECURE))
 #define	CTLFLAG_SECURE2	(CTLFLAG_SECURE | (1 << CTLSHIFT_SECURE))
 #define	CTLFLAG_SECURE3	(CTLFLAG_SECURE | (2 << CTLSHIFT_SECURE))
 
 /*
  * USE THIS instead of a hardwired number from the categories below
  * to get dynamically assigned sysctl entries using the linker-set
  * technology. This is the way nearly all new sysctl variables should
  * be implemented.
  * e.g. SYSCTL_INT(_parent, OID_AUTO, name, CTLFLAG_RW, &variable, 0, "");
  */
 #define	OID_AUTO	(-1)
 
 /*
  * The starting number for dynamically-assigned entries.  WARNING!
  * ALL static sysctl entries should have numbers LESS than this!
  */
 #define	CTL_AUTO_START	0x100
 
 #ifdef _KERNEL
 #include <sys/linker_set.h>
 
 #ifdef KLD_MODULE
 /* XXX allow overspecification of type in external kernel modules */
 #define	SYSCTL_CT_ASSERT_MASK CTLTYPE
 #else
 #define	SYSCTL_CT_ASSERT_MASK 0
 #endif
 
 #define	SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1,	\
 	intmax_t arg2, struct sysctl_req *req
 
 /* definitions for sysctl_req 'lock' member */
 #define	REQ_UNWIRED	1
 #define	REQ_WIRED	2
 
 /* definitions for sysctl_req 'flags' member */
 #ifdef COMPAT_FREEBSD32
 #define	SCTL_MASK32	1	/* 32 bit emulation */
 #endif
 
 /*
  * This describes the access space for a sysctl request.  This is needed
  * so that we can use the interface from the kernel or from user-space.
  */
 struct thread;
 struct sysctl_req {
 	struct thread	*td;		/* used for access checking */
 	/* Takes the REQ_UNWIRED / REQ_WIRED values defined above. */
 	int		 lock;		/* wiring state */
 	/*
 	 * NOTE(review): the old* members presumably describe the buffer that
 	 * receives the current value and the new* members the buffer holding
 	 * the value being set -- confirm against the handlers.
 	 */
 	void		*oldptr;
 	size_t		 oldlen;
 	size_t		 oldidx;
 	/* Callback invoked by SYSCTL_OUT() and SYSCTL_OUT_STR(). */
 	int		(*oldfunc)(struct sysctl_req *, const void *, size_t);
 	const void		*newptr;
 	size_t		 newlen;
 	size_t		 newidx;
 	/* Callback invoked by SYSCTL_IN(). */
 	int		(*newfunc)(struct sysctl_req *, void *, size_t);
 	size_t		 validlen;
 	/* Request flags, e.g. SCTL_MASK32 when COMPAT_FREEBSD32 is defined. */
 	int		 flags;
 };
 
 struct sysctl_oid;
 
 /* RB Tree handling */
 RB_HEAD(sysctl_oid_list, sysctl_oid);
 
 /*
  * This describes one "oid" in the MIB tree.  Potentially more nodes can
  * be hidden behind it, expanded by the handler.
  */
 struct sysctl_oid {
 	/* Head of the red-black tree holding this oid's children. */
 	struct sysctl_oid_list	oid_children;
 	/*
 	 * Tree head this oid is linked into: the parent's oid_children, or
 	 * &sysctl__children for oids without a parent (see SYSCTL_PARENT).
 	 */
 	struct sysctl_oid_list*	oid_parent;
 	/* Red-black tree linkage among siblings. */
 	RB_ENTRY(sysctl_oid) oid_link;
 	/* Sort key for all siblings, and lookup key for userland */
 	int		 oid_number;
 	/* CTLTYPE_* value (CTLTYPE mask) or'ed with CTLFLAG_* bits. */
 	u_int		 oid_kind;
 	/*
 	 * Same types as the arg1/arg2 members of SYSCTL_HANDLER_ARGS;
 	 * presumably handed to oid_handler unchanged -- confirm in the
 	 * dispatch code.
 	 */
 	void		*oid_arg1;
 	intmax_t	 oid_arg2;
 	/* Must be unique amongst all siblings. */
 	const char	*oid_name;
 	int		(*oid_handler)(SYSCTL_HANDLER_ARGS);
 	/* Format string, e.g. "N", "A", "I", "IU", "Q" in the macros below. */
 	const char	*oid_fmt;
 	int		 oid_refcnt;
 	u_int		 oid_running;
 	/* Description; "" when built with NO_SYSCTL_DESCR (see __DESCR). */
 	const char	*oid_descr;
 	/* Optional label; NULL unless set through a *_WITH_LABEL macro. */
 	const char	*oid_label;
 };
 
 /*
  * Three-way comparison of two oids by oid_number, for use as the
  * red-black tree comparator: 1 if a sorts after b, -1 if before, 0 if the
  * numbers are equal.
  */
 static inline int
 cmp_sysctl_oid(struct sysctl_oid *a, struct sysctl_oid *b)
 {
 	const int lhs = a->oid_number;
 	const int rhs = b->oid_number;
 
 	/* Each relational result is 0 or 1, so the difference is -1, 0 or 1. */
 	return ((lhs > rhs) - (lhs < rhs));
 }
 
 RB_PROTOTYPE(sysctl_oid_list, sysctl_oid, oid_link, cmp_sysctl_oid);
 
 /*
  * Transfer helpers for sysctl handlers; each dispatches through a function
  * pointer stored in the request:
  *   SYSCTL_IN(r, p, l)    calls r->newfunc(r, p, l)
  *   SYSCTL_OUT(r, p, l)   calls r->oldfunc(r, p, l)
  *   SYSCTL_OUT_STR(r, p)  calls r->oldfunc(r, p, strlen(p) + 1), i.e. the
  *                         length includes the terminating NUL
  * and evaluates to the callback's int return value.
  *
  * Arguments are parenthesized so that any pointer expression (for example
  * &req) can be passed as r.  Note that r is evaluated twice, and p twice in
  * SYSCTL_OUT_STR, so the arguments must be free of side effects.
  */
 #define	SYSCTL_IN(r, p, l)	((r)->newfunc)((r), (p), (l))
 #define	SYSCTL_OUT(r, p, l)	((r)->oldfunc)((r), (p), (l))
 #define	SYSCTL_OUT_STR(r, p)	((r)->oldfunc)((r), (p), strlen(p) + 1)
 
 int sysctl_handle_bool(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_8(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_16(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_32(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_int(SYSCTL_HANDLER_ARGS);
 int sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_long(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_string(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_opaque(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_counter_u64_array(SYSCTL_HANDLER_ARGS);
 
 int sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS);
 int sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS);
 
 int sysctl_msec_to_sbintime(SYSCTL_HANDLER_ARGS);
 int sysctl_usec_to_sbintime(SYSCTL_HANDLER_ARGS);
 int sysctl_sec_to_timeval(SYSCTL_HANDLER_ARGS);
 
 int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS);
 int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS);
 
 /*
  * These functions are used to add/remove an oid from the mib.
  */
 void sysctl_register_oid(struct sysctl_oid *oidp);
 void sysctl_register_disabled_oid(struct sysctl_oid *oidp);
 void sysctl_enable_oid(struct sysctl_oid *oidp);
 void sysctl_unregister_oid(struct sysctl_oid *oidp);
 
 /* Declare a static oid to allow child oids to be added to it. */
 #define	SYSCTL_DECL(name)			\
 	extern struct sysctl_oid sysctl__##name
 
 /* Hide these in macros. */
 #define	SYSCTL_CHILDREN(oid_ptr)		(&(oid_ptr)->oid_children)
 /*
  * Map an oid back to its parent oid.  oid_parent points at the parent's
  * oid_children head, so __containerof() recovers the enclosing
  * struct sysctl_oid.  Oids linked directly into sysctl__children have no
  * parent oid and yield NULL.  oid_ptr is evaluated twice.
  */
 #define	SYSCTL_PARENT(oid_ptr)					\
     (((oid_ptr)->oid_parent != &sysctl__children) ?		\
 	__containerof((oid_ptr)->oid_parent, struct sysctl_oid,	\
 	oid_children) : (struct sysctl_oid *)NULL)
 #define	SYSCTL_STATIC_CHILDREN(oid_name)	(&sysctl__##oid_name.oid_children)
 
 /* === Structs and macros related to context handling. === */
 
 /* All dynamically created sysctls can be tracked in a context list. */
 struct sysctl_ctx_entry {
 	struct sysctl_oid *entry;
 	TAILQ_ENTRY(sysctl_ctx_entry) link;
 };
 
 TAILQ_HEAD(sysctl_ctx_list, sysctl_ctx_entry);
 
 #define	SYSCTL_NODE_CHILDREN(parent, name) \
 	sysctl__##parent##_##name.oid_children
 
 #ifndef NO_SYSCTL_DESCR
 #define	__DESCR(d) d
 #else
 #define	__DESCR(d) ""
 #endif
 
 /*
  * Compile-time check that exactly one of CTLFLAG_MPSAFE and
  * CTLFLAG_NEEDGIANT is set in x.  The check is only compiled in when
  * `notyet' is defined; otherwise the macro expands to nothing.
  */
 #ifdef	notyet
 #define	SYSCTL_ENFORCE_FLAGS(x)						\
     _Static_assert((((x) & CTLFLAG_MPSAFE) != 0) ^ (((x) & CTLFLAG_NEEDGIANT) != 0), \
         "Has to be either CTLFLAG_MPSAFE or CTLFLAG_NEEDGIANT")
 #else
 #define	SYSCTL_ENFORCE_FLAGS(x)
 #endif
 
 /* This macro is only for internal use */
 /*
  * Defines a struct sysctl_oid object named `id', initialized from the
  * arguments, adds it to the sysctl_set linker set with DATA_SET() and
  * applies SYSCTL_ENFORCE_FLAGS() to `kind'.
  *
  * No storage class is emitted: wrappers prepend `static' when the object
  * should have internal linkage.  oid_children starts out as an empty tree
  * and the description goes through __DESCR(), so it becomes "" when
  * NO_SYSCTL_DESCR is defined.  The expansion does not end in a semicolon;
  * the invocation supplies it.
  */
 #define	SYSCTL_OID_RAW(id, parent_child_head, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
 	struct sysctl_oid id = {					\
 		.oid_parent = (parent_child_head),			\
 		.oid_children = RB_INITIALIZER(&id.oid_children), \
 		.oid_number = (nbr),					\
 		.oid_kind = (kind),					\
 		.oid_arg1 = (a1),					\
 		.oid_arg2 = (a2),					\
 		.oid_name = (name),					\
 		.oid_handler = (handler),				\
 		.oid_fmt = (fmt),					\
 		.oid_descr = __DESCR(descr),				\
 		.oid_label = (label),					\
 	};								\
 	DATA_SET(sysctl_set, id);					\
 	SYSCTL_ENFORCE_FLAGS(kind)
 
 /* This constructs a static "raw" MIB oid. */
 #define	SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 	SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2,		\
 	    handler, fmt, descr, NULL)
 
 /*
  * As SYSCTL_OID, with an explicit label.  The oid object is named
  * sysctl__<parent>_<name>, declared static, linked under
  * sysctl__<parent> and uses the stringified `name' as its oid_name.
  */
 #define	SYSCTL_OID_WITH_LABEL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
     static SYSCTL_OID_RAW(sysctl__##parent##_##name,			\
 	SYSCTL_CHILDREN(&sysctl__##parent),				\
 	nbr, #name, kind, a1, a2, handler, fmt, descr, label)
 
 /* This constructs a global "raw" MIB oid. */
 #define	SYSCTL_OID_GLOBAL(parent, nbr, name, kind, a1, a2, handler, fmt, descr, label) \
     SYSCTL_OID_RAW(sysctl__##parent##_##name, \
 	SYSCTL_CHILDREN(&sysctl__##parent),	\
 	nbr, #name, kind, a1, a2, handler, fmt, descr, label)
 
 /*
  * Run-time variant of SYSCTL_OID: applies SYSCTL_ENFORCE_FLAGS() to `kind'
  * and evaluates to the result of sysctl_add_oid(), called with a NULL
  * label and the description passed through __DESCR().
  */
 #define	SYSCTL_ADD_OID(ctx, parent, nbr, name, kind, a1, a2, handler, fmt, descr) \
 ({									\
 	SYSCTL_ENFORCE_FLAGS(kind);					\
 	sysctl_add_oid(ctx, parent, nbr, name, kind, a1, a2,handler,	\
 	    fmt, __DESCR(descr), NULL);					\
 })
 
 /* This constructs a root node from which other nodes can hang. */
 #define	SYSCTL_ROOT_NODE(nbr, name, access, handler, descr)	\
 	SYSCTL_OID_RAW(sysctl___##name, &sysctl__children,	\
 	    nbr, #name, CTLTYPE_NODE|(access), NULL, 0,		\
 	    handler, "N", descr, NULL);				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
 /* This constructs a node from which other oids can hang. */
 #define	SYSCTL_NODE(parent, nbr, name, access, handler, descr) \
 	SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, NULL)
 
 /*
  * As SYSCTL_NODE, with an explicit label.  Built on SYSCTL_OID_GLOBAL, so
  * the node object sysctl__<parent>_<name> is not static.
  *
  * The CTASSERT rejects an `access' carrying type bits other than
  * CTLTYPE_NODE; because SYSCTL_CT_ASSERT_MASK is 0 unless KLD_MODULE is
  * defined, outside of modules `access' must carry no type bits at all.
  */
 #define	SYSCTL_NODE_WITH_LABEL(parent, nbr, name, access, handler, descr, label) \
 	SYSCTL_OID_GLOBAL(parent, nbr, name, CTLTYPE_NODE|(access),	\
 	    NULL, 0, handler, "N", descr, label);			\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE)
 
 #define	SYSCTL_ADD_NODE(ctx, parent, nbr, name, access, handler, descr)	\
 	SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, \
 	    handler, descr, NULL)
 
 /*
  * Run-time variant of SYSCTL_NODE_WITH_LABEL: after the same compile-time
  * checks on `access', evaluates to the result of sysctl_add_oid() for a
  * CTLTYPE_NODE oid with format "N" and no arg1/arg2.
  */
 #define	SYSCTL_ADD_NODE_WITH_LABEL(ctx, parent, nbr, name, access, handler, descr, label) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE);	\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_NODE|(access),	\
 	    NULL, 0, handler, "N", __DESCR(descr), label);		\
 })
 
 /*
  * Run-time variant of SYSCTL_ROOT_NODE: like SYSCTL_ADD_NODE, but the new
  * node is always added under &sysctl__children and carries no label.
  * Evaluates to the result of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_ROOT_NODE(ctx, nbr, name, access, handler, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_NODE);	\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, &sysctl__children, nbr, name,		\
 	    CTLTYPE_NODE|(access),					\
 	    NULL, 0, handler, "N", __DESCR(descr), NULL);		\
 })
 
 /* Oid for a string.  len can be 0 to indicate '\0' termination. */
 #define	SYSCTL_STRING(parent, nbr, name, access, arg, len, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),			\
 	    arg, len, sysctl_handle_string, "A", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING)
 
 /*
  * Run-time variant of SYSCTL_STRING.  The __arg temporary evaluates `arg'
  * once and makes the compiler check it against char *.  Evaluates to the
  * result of sysctl_add_oid() for a CTLTYPE_STRING, CTLFLAG_MPSAFE oid
  * handled by sysctl_handle_string with format "A".
  */
 #define	SYSCTL_ADD_STRING(ctx, parent, nbr, name, access, arg, len, descr) \
 ({									\
 	char *__arg = (arg);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),			\
 	    __arg, len, sysctl_handle_string, "A", __DESCR(descr),	\
 	    NULL); \
 })
 
 /* Oid for a constant '\0' terminated string. */
 #define	SYSCTL_CONST_STRING(parent, nbr, name, access, arg, descr)	\
 	SYSCTL_OID(parent, nbr, name, CTLTYPE_STRING | CTLFLAG_MPSAFE | (access),\
 	    __DECONST(char *, arg), 0, sysctl_handle_string, "A", descr); \
 	CTASSERT(!((access) & CTLFLAG_WR));				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING)
 
 /*
  * Run-time variant of SYSCTL_CONST_STRING.  The const qualifier of `arg'
  * is dropped with __DECONST(); in exchange a CTASSERT refuses CTLFLAG_WR
  * in `access'.  The length argument is always 0.  Evaluates to the result
  * of sysctl_add_oid().
  */
 #define	SYSCTL_ADD_CONST_STRING(ctx, parent, nbr, name, access, arg, descr) \
 ({									\
 	char *__arg = __DECONST(char *, arg);				\
 	CTASSERT(!((access) & CTLFLAG_WR));				\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_STRING);	\
 	sysctl_add_oid(ctx, parent, nbr, name, CTLTYPE_STRING | 	\
 	    CTLFLAG_MPSAFE | (access), __arg, 0, sysctl_handle_string, "A",\
 	    __DESCR(descr), NULL); 					\
 })
 
 /* Oid for a bool.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_BOOL_PTR ((bool *)NULL)
 #define	SYSCTL_BOOL(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_bool, "CU", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 &&			\
 	    sizeof(bool) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_BOOL.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against bool *.  `access' must not
  * carry any type bits.  The oid is registered as CTLTYPE_U8 with format
  * "CU" and sysctl_handle_bool; the macro evaluates to the result of
  * sysctl_add_oid().
  */
 #define	SYSCTL_ADD_BOOL(ctx, parent, nbr, name, access, ptr, val, descr) \
 ({									\
 	bool *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0);				\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_bool, "CU", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /* Oid for a signed 8-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_S8_PTR ((int8_t *)NULL)
 #define	SYSCTL_S8(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_8, "C", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8) && \
 	    sizeof(int8_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_S8.  The __ptr temporary evaluates `ptr' once
  * and makes the compiler check it against int8_t *.  Evaluates to the
  * result of sysctl_add_oid() (CTLTYPE_S8, sysctl_handle_8, format "C").
  */
 #define	SYSCTL_ADD_S8(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int8_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S8);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_8, "C", __DESCR(descr), NULL);	\
 })
 
 /* Oid for an unsigned 8-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_U8_PTR ((uint8_t *)NULL)
 #define	SYSCTL_U8(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_8, "CU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8) && \
 	    sizeof(uint8_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_U8.  The __ptr temporary evaluates `ptr' once
  * and makes the compiler check it against uint8_t *.  Evaluates to the
  * result of sysctl_add_oid() (CTLTYPE_U8, sysctl_handle_8, format "CU").
  */
 #define	SYSCTL_ADD_U8(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint8_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U8);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U8 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_8, "CU", __DESCR(descr), NULL);	\
 })
 
 /* Oid for a signed 16-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_S16_PTR ((int16_t *)NULL)
 #define	SYSCTL_S16(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_16, "S", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16) && \
 	    sizeof(int16_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_S16.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against int16_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_S16, sysctl_handle_16, "S").
  */
 #define	SYSCTL_ADD_S16(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int16_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S16);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S16 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_16, "S", __DESCR(descr), NULL);	\
 })
 
 /* Oid for an unsigned 16-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_U16_PTR ((uint16_t *)NULL)
 #define	SYSCTL_U16(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_16, "SU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16) && \
 	    sizeof(uint16_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_U16.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against uint16_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_U16, sysctl_handle_16, "SU").
  */
 #define	SYSCTL_ADD_U16(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint16_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U16);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U16 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_16, "SU", __DESCR(descr), NULL);	\
 })
 
 /* Oid for a signed 32-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_S32_PTR ((int32_t *)NULL)
 #define	SYSCTL_S32(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_32, "I", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32) && \
 	    sizeof(int32_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_S32.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against int32_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_S32, sysctl_handle_32, "I").
  */
 #define	SYSCTL_ADD_S32(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int32_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S32);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S32 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_32, "I", __DESCR(descr), NULL);	\
 })
 
 /* Oid for an unsigned 32-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_U32_PTR ((uint32_t *)NULL)
 #define	SYSCTL_U32(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_32, "IU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32) && \
 	    sizeof(uint32_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_U32.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against uint32_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_U32, sysctl_handle_32, "IU").
  */
 #define	SYSCTL_ADD_U32(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint32_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U32);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U32 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_32, "IU", __DESCR(descr), NULL);	\
 })
 
 /* Oid for a signed 64-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_S64_PTR ((int64_t *)NULL)
 #define	SYSCTL_S64(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "Q", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
 	    sizeof(int64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_S64.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against int64_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_S64, sysctl_handle_64, "Q").
  */
 #define	SYSCTL_ADD_S64(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int64_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_64, "Q", __DESCR(descr), NULL);	\
 })
 
 /* Oid for an unsigned 64-bit int.  If ptr is NULL, val is returned. */
 #define	SYSCTL_NULL_U64_PTR ((uint64_t *)NULL)
 #define	SYSCTL_U64(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "QU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) && \
 	    sizeof(uint64_t) == sizeof(*(ptr)))
 
 /*
  * Run-time variant of SYSCTL_U64.  The __ptr temporary evaluates `ptr'
  * once and makes the compiler check it against uint64_t *.  Evaluates to
  * the result of sysctl_add_oid() (CTLTYPE_U64, sysctl_handle_64, "QU").
  */
 #define	SYSCTL_ADD_U64(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	uint64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_64, "QU", __DESCR(descr), NULL);	\
 })
 
 /* Oid for an int.  If ptr is SYSCTL_NULL_INT_PTR, val is returned. */
 #define	SYSCTL_NULL_INT_PTR ((int *)NULL)
 #define	SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) \
 	SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, NULL)
 
 /*
  * Same as SYSCTL_INT, with an additional 'label' forwarded to
  * SYSCTL_OID_WITH_LABEL.  The OID has type CTLTYPE_INT, handler
  * sysctl_handle_int and format "I".  The CTASSERT rejects an 'access' that
  * carries a type other than CTLTYPE_INT and a 'ptr' whose target does not
  * have the size of an int.  The expansion ends without a semicolon; the
  * invocation supplies it.
  */
 #define	SYSCTL_INT_WITH_LABEL(parent, nbr, name, access, ptr, val, descr, label) \
 	SYSCTL_OID_WITH_LABEL(parent, nbr, name,			\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    ptr, val, sysctl_handle_int, "I", descr, label);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT) && \
 	    sizeof(int) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_INT: registers the OID with sysctl_add_oid()
  * (no label) and evaluates to that call's return value.  Binding 'ptr' to the
  * int pointer __ptr makes the compiler check the pointer's type; the CTASSERT
  * rejects an 'access' carrying a type other than CTLTYPE_INT.
  */
 #define	SYSCTL_ADD_INT(ctx, parent, nbr, name, access, ptr, val, descr)	\
 ({									\
 	int *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_int, "I", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned int.  If ptr is NULL, val is returned.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_UINT, handler
  * sysctl_handle_int (shared with the signed variant) and format "IU".  The
  * CTASSERT that follows rejects an 'access' that carries a type other than
  * CTLTYPE_UINT and a 'ptr' whose target does not have the size of an
  * unsigned.  The expansion ends without a semicolon; the invocation supplies
  * it.
  */
 /* Typed NULL, so that sizeof(*(ptr)) in the CTASSERT below still compiles. */
 #define	SYSCTL_NULL_UINT_PTR ((unsigned *)NULL)
 #define	SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_int, "IU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT) && \
 	    sizeof(unsigned) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_UINT: registers the OID with sysctl_add_oid()
  * (no label) and evaluates to that call's return value.  Binding 'ptr' to the
  * unsigned pointer __ptr makes the compiler check the pointer's type; the
  * CTASSERT rejects an 'access' carrying a type other than CTLTYPE_UINT.
  */
 #define	SYSCTL_ADD_UINT(ctx, parent, nbr, name, access, ptr, val, descr) \
 ({									\
 	unsigned *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_UINT);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, val, sysctl_handle_int, "IU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a long.  The pointer must be non NULL.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_LONG, handler
  * sysctl_handle_long and format "L".  The CTASSERT that follows rejects an
  * 'access' that carries a type other than CTLTYPE_LONG and a 'ptr' whose
  * target does not have the size of a long.  The expansion ends without a
  * semicolon; the invocation supplies it.
  *
  * NOTE(review): a typed NULL constant is provided below and 'val' is
  * forwarded to SYSCTL_OID, which sits oddly with "must be non NULL" --
  * confirm against sysctl_handle_long().
  */
 #define	SYSCTL_NULL_LONG_PTR ((long *)NULL)
 #define	SYSCTL_LONG(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_LONG | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_long, "L", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG) && \
 	    sizeof(long) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_LONG: registers the OID with sysctl_add_oid()
  * (no label) and evaluates to that call's return value.  Unlike SYSCTL_LONG
  * it takes no 'val' argument; 0 is always passed as arg2.  Binding 'ptr' to
  * the long pointer __ptr makes the compiler check the pointer's type; the
  * CTASSERT rejects an 'access' carrying a type other than CTLTYPE_LONG.
  */
 #define	SYSCTL_ADD_LONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	long *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_LONG);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_LONG | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_long, "L", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned long.  The pointer must be non NULL.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_ULONG, handler
  * sysctl_handle_long (shared with the signed variant) and format "LU".  The
  * CTASSERT that follows rejects an 'access' that carries a type other than
  * CTLTYPE_ULONG and a 'ptr' whose target does not have the size of an
  * unsigned long.  The expansion ends without a semicolon; the invocation
  * supplies it.
  *
  * NOTE(review): a typed NULL constant is provided below and 'val' is
  * forwarded to SYSCTL_OID, which sits oddly with "must be non NULL" --
  * confirm against sysctl_handle_long().
  */
 #define	SYSCTL_NULL_ULONG_PTR ((unsigned long *)NULL)
 #define	SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access),			\
 	    ptr, val, sysctl_handle_long, "LU", descr);			\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG) &&	\
 	    sizeof(unsigned long) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_ULONG: registers the OID with
  * sysctl_add_oid() (no label) and evaluates to that call's return value.
  * Unlike SYSCTL_ULONG it takes no 'val' argument; 0 is always passed as arg2.
  * Binding 'ptr' to the unsigned long pointer __ptr makes the compiler check
  * the pointer's type; the CTASSERT rejects an 'access' carrying a type other
  * than CTLTYPE_ULONG.
  */
 #define	SYSCTL_ADD_ULONG(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	unsigned long *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_ULONG);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_ULONG | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_long, "LU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a quad.  The pointer must be non NULL.
  *
  * The expansion is token-for-token the same as SYSCTL_S64: type CTLTYPE_S64,
  * handler sysctl_handle_64, format "Q", and a CTASSERT that rejects an
  * 'access' carrying a type other than CTLTYPE_S64 and a 'ptr' whose target
  * does not have the size of an int64_t.  The expansion ends without a
  * semicolon; the invocation supplies it.
  */
 #define	SYSCTL_NULL_QUAD_PTR ((int64_t *)NULL)
 #define	SYSCTL_QUAD(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),		\
 	    ptr, val, sysctl_handle_64, "Q", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64) && \
 	    sizeof(int64_t) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_QUAD: registers the OID with sysctl_add_oid()
  * (no label) and evaluates to that call's return value.  It takes no 'val'
  * argument; 0 is always passed as arg2.  Binding 'ptr' to the int64_t pointer
  * __ptr makes the compiler check the pointer's type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_S64.
  */
 #define	SYSCTL_ADD_QUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	int64_t *__ptr = (ptr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_64, "Q", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for an unsigned quad.
  *
  * The expansion matches SYSCTL_U64: type CTLTYPE_U64, handler
  * sysctl_handle_64, format "QU", and a CTASSERT that rejects an 'access'
  * carrying a type other than CTLTYPE_U64 and a 'ptr' whose target does not
  * have the size of a uint64_t.  The expansion ends without a semicolon; the
  * invocation supplies it.
  */
 #define	SYSCTL_NULL_UQUAD_PTR ((uint64_t *)NULL)
 #define	SYSCTL_UQUAD(parent, nbr, name, access, ptr, val, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	     ptr, val, sysctl_handle_64, "QU", descr);			\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
 	    sizeof(uint64_t) == sizeof(*(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_UQUAD: registers the OID with
  * sysctl_add_oid() (no label) and evaluates to that call's return value.  It
  * takes no 'val' argument; 0 is always passed as arg2.  Binding 'ptr' to the
  * uint64_t pointer __ptr makes the compiler check the pointer's type; the
  * CTASSERT rejects an 'access' carrying a type other than CTLTYPE_U64.
  */
 #define	SYSCTL_ADD_UQUAD(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uint64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_64, "QU", __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a CPU dependent variable.
  *
  * The width of *ptr picks the flavour at compile time: a target as wide as a
  * uint64_t is registered as CTLTYPE_U64 ("QU", sysctl_handle_64), anything
  * else as CTLTYPE_UINT ("IU", sysctl_handle_int).  The CTASSERT only lets
  * those two widths through and requires 'access' to carry no type bits.
  * Evaluates to the value returned by sysctl_add_oid().
  */
 #define	SYSCTL_ADD_UAUTO(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 &&				\
 	    (sizeof(*(ptr)) == sizeof(uint64_t) ||			\
 	    sizeof(*(ptr)) == sizeof(unsigned)));			\
 	(sizeof(*(ptr)) == sizeof(uint64_t)) ?				\
 	    sysctl_add_oid(ctx, parent, nbr, name,			\
 		CTLTYPE_U64 | CTLFLAG_MPSAFE | (access),		\
 		(ptr), 0, sysctl_handle_64, "QU",			\
 		__DESCR(descr), NULL) :					\
 	    sysctl_add_oid(ctx, parent, nbr, name,			\
 		CTLTYPE_UINT | CTLFLAG_MPSAFE | (access),		\
 		(ptr), 0, sysctl_handle_int, "IU",			\
 		__DESCR(descr), NULL);					\
 })
 
 /*
  * Oid for a 64-bit unsigned counter(9).  The pointer must be non NULL.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_U64, handler
  * sysctl_handle_counter_u64 and format "QU"; CTLFLAG_STATS is always set and
  * arg2 is always 0.  The CTASSERT that follows rejects an 'access' carrying a
  * type other than CTLTYPE_U64, a 'ptr' whose target does not have the size of
  * a counter_u64_t, and a counter whose own target does not have the size of a
  * uint64_t.  The expansion ends without a semicolon; the invocation supplies
  * it.
  */
 #define	SYSCTL_COUNTER_U64(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    (ptr), 0, sysctl_handle_counter_u64, "QU", descr);		\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64) &&	\
 	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
 	    sizeof(uint64_t) == sizeof(**(ptr)))
 
 /*
  * Dynamic counterpart of SYSCTL_COUNTER_U64: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0, CTLFLAG_STATS always set) and
  * evaluates to that call's return value.  Binding 'ptr' to the counter_u64_t
  * pointer __ptr makes the compiler check the pointer's type; the CTASSERT
  * rejects an 'access' carrying a type other than CTLTYPE_U64.
  */
 #define	SYSCTL_ADD_COUNTER_U64(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	counter_u64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_U64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    __ptr, 0, sysctl_handle_counter_u64, "QU", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Oid for an array of counter(9)s.  The pointer and length must be non zero.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_OPAQUE, handler
  * sysctl_handle_counter_u64_array and format "S"; CTLFLAG_STATS is always
  * set, 'ptr' is passed as arg1 and 'len' as arg2.  Type and format must agree
  * with the CTASSERT below and with SYSCTL_ADD_COUNTER_U64_ARRAY: declaring
  * the node as CTLTYPE_U64 would OR two different type codes together
  * whenever a caller passes the CTLTYPE_OPAQUE that the CTASSERT permits.
  *
  * The CTASSERT rejects an 'access' carrying a type other than
  * CTLTYPE_OPAQUE, a 'ptr' whose target does not have the size of a
  * counter_u64_t, and a counter whose own target does not have the size of a
  * uint64_t.  The expansion ends without a semicolon; the invocation supplies
  * it.
  */
 #define	SYSCTL_COUNTER_U64_ARRAY(parent, nbr, name, access, ptr, len, descr) \
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    (ptr), (len), sysctl_handle_counter_u64_array, "S", descr);	\
 	CTASSERT((((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE) &&	\
 	    sizeof(counter_u64_t) == sizeof(*(ptr)) &&			\
 	    sizeof(uint64_t) == sizeof(**(ptr)))
 
 /*
  * Dynamic registration of an array of counter(9)s: calls sysctl_add_oid()
  * with type CTLTYPE_OPAQUE, handler sysctl_handle_counter_u64_array, format
  * "S", 'len' as arg2 and no label, and evaluates to that call's return value.
  * CTLFLAG_STATS is always set.  Binding 'ptr' to the counter_u64_t pointer
  * __ptr makes the compiler check the pointer's type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_OPAQUE.
  */
 #define	SYSCTL_ADD_COUNTER_U64_ARRAY(ctx, parent, nbr, name, access,	\
     ptr, len, descr)							\
 ({									\
 	counter_u64_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_STATS | (access),	\
 	    __ptr, len, sysctl_handle_counter_u64_array, "S",		\
 	    __DESCR(descr), NULL);					\
 })
 
 /*
  * Oid for an opaque object.  Specified by a pointer and a length.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_OPAQUE and handler
  * sysctl_handle_opaque; 'ptr' is arg1, 'len' is arg2 and 'fmt' is supplied by
  * the caller.  The CTASSERT only checks that 'access' carries no type other
  * than CTLTYPE_OPAQUE; nothing is verified about 'ptr'.  The expansion ends
  * without a semicolon; the invocation supplies it.
  */
 #define	SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, len, sysctl_handle_opaque, fmt, descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE)
 
 /*
  * Dynamic counterpart of SYSCTL_OPAQUE: registers the OID with
  * sysctl_add_oid() (no label) and evaluates to that call's return value.
  * The CTASSERT rejects an 'access' carrying a type other than CTLTYPE_OPAQUE;
  * 'ptr' is forwarded without any type check.
  */
 #define	SYSCTL_ADD_OPAQUE(ctx, parent, nbr, name, access, ptr, len, fmt, descr)	\
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, len, sysctl_handle_opaque, fmt, __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a struct.  Specified by a pointer and a type.
  *
  * 'type' is the bare structure tag: the macro itself prepends 'struct' to
  * compute arg2 as sizeof(struct type), and stringifies the tag to build the
  * format "S,<type>".  The OID is declared through SYSCTL_OID with type
  * CTLTYPE_OPAQUE and handler sysctl_handle_opaque.  The CTASSERT only checks
  * that 'access' carries no type other than CTLTYPE_OPAQUE.  The expansion
  * ends without a semicolon; the invocation supplies it.
  */
 #define	SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    ptr, sizeof(struct type), sysctl_handle_opaque,		\
 	    "S," #type, descr);						\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE)
 
 /*
  * Dynamic counterpart of SYSCTL_STRUCT: registers the OID with
  * sysctl_add_oid() (no label) and evaluates to that call's return value.
  * As in SYSCTL_STRUCT, 'type' is the bare structure tag, used both for
  * sizeof(struct type) and for the "S,<type>" format.  The CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_OPAQUE.
  */
 #define	SYSCTL_ADD_STRUCT(ctx, parent, nbr, name, access, ptr, type, descr) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_OPAQUE);	\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | (access),			\
 	    (ptr), sizeof(struct type),					\
 	    sysctl_handle_opaque, "S," #type, __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid for a procedure.  Specified by a pointer and an arg.
  *
  * Unlike the typed macros above, no type or CTLFLAG_MPSAFE is added here:
  * 'access' is handed to SYSCTL_OID unchanged, and the CTASSERT therefore
  * requires it to already carry a type (non-zero CTLTYPE bits).  'handler' and
  * 'fmt' are supplied by the caller.  The expansion ends without a semicolon;
  * the invocation supplies it.
  */
 #define	SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 	SYSCTL_OID(parent, nbr, name, (access),				\
 	    ptr, arg, handler, fmt, descr);				\
 	CTASSERT(((access) & CTLTYPE) != 0)
 
 /*
  * Dynamic counterpart of SYSCTL_PROC: registers the OID with sysctl_add_oid()
  * (no label) and evaluates to that call's return value.  'access' is passed
  * through unchanged, so the CTASSERT requires it to carry a type; it is also
  * run through SYSCTL_ENFORCE_FLAGS() before the registration.
  */
 #define	SYSCTL_ADD_PROC(ctx, parent, nbr, name, access, ptr, arg, handler, fmt, descr) \
 ({									\
 	CTASSERT(((access) & CTLTYPE) != 0);				\
 	SYSCTL_ENFORCE_FLAGS(access);					\
 	sysctl_add_oid(ctx, parent, nbr, name, (access),		\
 	    (ptr), (arg), (handler), (fmt), __DESCR(descr), NULL);	\
 })
 
 /*
  * Oid to handle limits on uma(9) zone specified by pointer.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_INT, handler
  * sysctl_handle_uma_zone_max and format "I"; arg2 is always 0.  The CTASSERT
  * only checks that 'access' carries no type other than CTLTYPE_INT; nothing
  * is verified about 'ptr' here.  The expansion ends without a semicolon; the
  * invocation supplies it.
  */
 #define	SYSCTL_UMA_MAX(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),		\
 	    (ptr), 0, sysctl_handle_uma_zone_max, "I", descr);	\
 	CTASSERT(((access) & CTLTYPE) == 0 ||			\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 
 /*
  * Dynamic counterpart of SYSCTL_UMA_MAX: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0) and evaluates to that call's return
  * value.  Note that 'ptr' is bound to a uma_zone_t (not a pointer to one),
  * which makes the compiler check its type; the CTASSERT rejects an 'access'
  * carrying a type other than CTLTYPE_INT.
  */
 #define	SYSCTL_ADD_UMA_MAX(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uma_zone_t __ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | (access),			\
 	    __ptr, 0, sysctl_handle_uma_zone_max, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Oid to obtain current use of uma(9) zone specified by pointer.
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_INT, handler
  * sysctl_handle_uma_zone_cur and format "I"; arg2 is always 0 and CTLFLAG_RD
  * is set regardless of 'access'.  The CTASSERT only checks that 'access'
  * carries no type other than CTLTYPE_INT; nothing is verified about 'ptr'
  * here.  The expansion ends without a semicolon; the invocation supplies it.
  */
 #define	SYSCTL_UMA_CUR(parent, nbr, name, access, ptr, descr)		\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_handle_uma_zone_cur, "I", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 
 /*
  * Dynamic counterpart of SYSCTL_UMA_CUR: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0, CTLFLAG_RD always set) and evaluates
  * to that call's return value.  'ptr' is bound to a uma_zone_t (not a pointer
  * to one), which makes the compiler check its type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_INT.
  */
 #define	SYSCTL_ADD_UMA_CUR(ctx, parent, nbr, name, access, ptr, descr)	\
 ({									\
 	uma_zone_t __ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_handle_uma_zone_cur, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a sbintime_t as microseconds
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_S64, handler
  * sysctl_usec_to_sbintime and format "Q"; arg2 is always 0 and CTLFLAG_RD is
  * set regardless of 'access'.  The CTASSERT only checks that 'access' carries
  * no type other than CTLTYPE_S64; the type of 'ptr' is not verified here.
  * The expansion ends without a semicolon; the invocation supplies it.
  */
 #define	SYSCTL_SBINTIME_USEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_usec_to_sbintime, "Q", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64)
 /*
  * Dynamic counterpart of SYSCTL_SBINTIME_USEC: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0, CTLFLAG_RD always set) and evaluates
  * to that call's return value.  Binding 'ptr' to the sbintime_t pointer __ptr
  * makes the compiler check the pointer's type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_S64.
  */
 #define	SYSCTL_ADD_SBINTIME_USEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	sbintime_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_usec_to_sbintime, "Q", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a sbintime_t as milliseconds
  *
  * Identical to SYSCTL_SBINTIME_USEC except for the handler, which is
  * sysctl_msec_to_sbintime: type CTLTYPE_S64, format "Q", arg2 of 0, and
  * CTLFLAG_RD set regardless of 'access'.  The CTASSERT only checks that
  * 'access' carries no type other than CTLTYPE_S64; the type of 'ptr' is not
  * verified here.  The expansion ends without a semicolon; the invocation
  * supplies it.
  */
 #define	SYSCTL_SBINTIME_MSEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_msec_to_sbintime, "Q", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64)
 /*
  * Dynamic counterpart of SYSCTL_SBINTIME_MSEC: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0, CTLFLAG_RD always set) and evaluates
  * to that call's return value.  Binding 'ptr' to the sbintime_t pointer __ptr
  * makes the compiler check the pointer's type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_S64.
  */
 #define	SYSCTL_ADD_SBINTIME_MSEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	sbintime_t *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_S64);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_S64 | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_msec_to_sbintime, "Q", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * OID expressing a struct timeval as seconds
  *
  * The OID is declared through SYSCTL_OID with type CTLTYPE_INT, handler
  * sysctl_sec_to_timeval and format "I"; arg2 is always 0 and CTLFLAG_RD is
  * set regardless of 'access'.  The CTASSERT only checks that 'access' carries
  * no type other than CTLTYPE_INT; the type of 'ptr' is not verified here.
  * The expansion ends without a semicolon; the invocation supplies it.
  */
 #define	SYSCTL_TIMEVAL_SEC(parent, nbr, name, access, ptr, descr)	\
 	SYSCTL_OID(parent, nbr, name,					\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    (ptr), 0, sysctl_sec_to_timeval, "I", descr);		\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT)
 /*
  * Dynamic counterpart of SYSCTL_TIMEVAL_SEC: registers the OID with
  * sysctl_add_oid() (no label, arg2 of 0, CTLFLAG_RD always set) and evaluates
  * to that call's return value.  Binding 'ptr' to the struct timeval pointer
  * __ptr makes the compiler check the pointer's type; the CTASSERT rejects an
  * 'access' carrying a type other than CTLTYPE_INT.
  */
 #define	SYSCTL_ADD_TIMEVAL_SEC(ctx, parent, nbr, name, access, ptr, descr) \
 ({									\
 	struct timeval *__ptr = (ptr);					\
 	CTASSERT(((access) & CTLTYPE) == 0 ||				\
 	    ((access) & SYSCTL_CT_ASSERT_MASK) == CTLTYPE_INT);		\
 	sysctl_add_oid(ctx, parent, nbr, name,				\
 	    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RD | (access),	\
 	    __ptr, 0, sysctl_sec_to_timeval, "I", __DESCR(descr),	\
 	    NULL);							\
 })
 
 /*
  * Loop header that walks 'list' with 'oidp' as the iteration variable.  It is
  * a thin wrapper that hands both to RB_FOREACH with 'sysctl_oid_list' as the
  * middle argument (presumably the tree name -- confirm against the RB_*
  * declarations for struct sysctl_oid_list).
  */
 #define	SYSCTL_FOREACH(oidp, list) \
 	RB_FOREACH(oidp, sysctl_oid_list, list)
 
 /*
  * A macro to generate a read-only sysctl to indicate the presence of optional
  * kernel features.
  *
  * Expands to SYSCTL_INT_WITH_LABEL under _kern_features with an automatically
  * assigned number (OID_AUTO), flags CTLFLAG_RD | CTLFLAG_CAPRD, a NULL
  * pointer with a val of 1, and the label "feature".  Like SYSCTL_INT, the
  * expansion ends without a semicolon; the invocation supplies it.
  */
 #define	FEATURE(name, desc)						\
 	SYSCTL_INT_WITH_LABEL(_kern_features, OID_AUTO, name,		\
 	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1, desc, "feature")
 /*
  * Same for the dynamic registration.
  *
  * Calls sysctl_add_oid() directly with a NULL context, the children of
  * sysctl___kern_features as parent, a NULL arg1 with an arg2 of 1, and the
  * label "feature".  'desc' is passed as is (not through __DESCR), and the
  * expansion already ends in a semicolon, so the value returned by
  * sysctl_add_oid() is not available to the caller.
  */
 #define	FEATURE_ADD(name, desc)						\
 	sysctl_add_oid(NULL, SYSCTL_CHILDREN(&sysctl___kern_features),	\
 	    OID_AUTO, name,						\
 	    CTLFLAG_RD | CTLFLAG_CAPRD | CTLTYPE_INT | CTLFLAG_MPSAFE,	\
 	    NULL, 1, sysctl_handle_int, "I", desc, "feature");
 
+/*
+ * Adding new leaves to the 'debug.sizeof' MIB tree for ad-hoc reasons is
+ * discouraged, and in particular for reporting to developers the size of some
+ * kernel structures, which can be obtained by the following alternative means:
+ * 1. In GDB, load a full kernel image and use 'print(sizeof(struct XXX))'.
+ *    Alternatively, use 'ptype/o struct XXX' to additionally get the offsets
+ *    and sizes of all of the structure's fields.
+ * 2. If the structure is allocated from UMA, then 'vmstat -z' reports its size
+ *    (the mapping between structure types and zones is usually
+ *    straightforward).
+ */
+/*
+ * Generates a read-only sysctl reporting the size of an object/structure.
+ *
+ * Expands to a SYSCTL_INT under _debug_sizeof with an automatically assigned
+ * number (OID_AUTO) and a NULL pointer, so that the constant sizeof(expr),
+ * passed as val, is what gets reported.  The description is the string
+ * "sizeof(<expr>)", built from __STRING(expr).  'expr' may be anything that
+ * sizeof accepts, a type name included.  Note that the expansion already ends
+ * in a semicolon.
+ */
+#define SYSCTL_SIZEOF(name, expr)					\
+	SYSCTL_INT(_debug_sizeof, OID_AUTO, name, CTLFLAG_RD,		\
+	    SYSCTL_NULL_INT_PTR, sizeof(expr),				\
+	    "sizeof(" __STRING(expr) ")");
+/*
+ * Same, specialized for structures: 'struct_name' is the bare structure tag,
+ * used both as the sysctl's name and, prefixed with 'struct', as the operand
+ * of sizeof.
+ */
+#define SYSCTL_SIZEOF_STRUCT(struct_name)				\
+	SYSCTL_SIZEOF(struct_name, struct struct_name)
+
 #endif /* _KERNEL */
 
 /*
  * Top-level identifiers
  */
 #define	CTL_SYSCTL	0		/* "magic" numbers */
 #define	CTL_KERN	1		/* "high kernel": proc, limits */
 #define	CTL_VM		2		/* virtual memory */
 #define	CTL_VFS		3		/* filesystem, mount type is next */
 #define	CTL_NET		4		/* network, see socket.h */
 #define	CTL_DEBUG	5		/* debugging parameters */
 #define	CTL_HW		6		/* generic cpu/io */
 #define	CTL_MACHDEP	7		/* machine dependent */
 #define	CTL_USER	8		/* user-level */
 #define	CTL_P1003_1B	9		/* POSIX 1003.1B */
 
 /*
  * CTL_SYSCTL identifiers
  */
 #define	CTL_SYSCTL_DEBUG	0	/* printf all nodes */
 #define	CTL_SYSCTL_NAME		1	/* string name of OID */
 #define	CTL_SYSCTL_NEXT		2	/* next OID, honoring CTLFLAG_SKIP */
 #define	CTL_SYSCTL_NAME2OID	3	/* int array of name */
 #define	CTL_SYSCTL_OIDFMT	4	/* OID's kind and format */
 #define	CTL_SYSCTL_OIDDESCR	5	/* OID's description */
 #define	CTL_SYSCTL_OIDLABEL	6	/* aggregation label */
 #define	CTL_SYSCTL_NEXTNOSKIP	7	/* next OID, ignoring CTLFLAG_SKIP */
 
 /*
  * CTL_KERN identifiers
  */
 #define	KERN_OSTYPE		 1	/* string: system version */
 #define	KERN_OSRELEASE		 2	/* string: system release */
 #define	KERN_OSREV		 3	/* int: system revision */
 #define	KERN_VERSION		 4	/* string: compile time info */
 #define	KERN_MAXVNODES		 5	/* int: max vnodes */
 #define	KERN_MAXPROC		 6	/* int: max processes */
 #define	KERN_MAXFILES		 7	/* int: max open files */
 #define	KERN_ARGMAX		 8	/* int: max arguments to exec */
 #define	KERN_SECURELVL		 9	/* int: system security level */
 #define	KERN_HOSTNAME		10	/* string: hostname */
 #define	KERN_HOSTID		11	/* int: host identifier */
 #define	KERN_CLOCKRATE		12	/* struct: struct clockrate */
 /* was: #define	KERN_VNODE	13	; disabled in 2003 and removed in 2023 */
 #define	KERN_PROC		14	/* struct: process entries */
 #define	KERN_FILE		15	/* struct: file entries */
 #define	KERN_PROF		16	/* node: kernel profiling info */
 #define	KERN_POSIX1		17	/* int: POSIX.1 version */
 #define	KERN_NGROUPS		18	/* int: # of supplemental group ids */
 #define	KERN_JOB_CONTROL	19	/* int: is job control available */
 #define	KERN_SAVED_IDS		20	/* int: saved set-user/group-ID */
 #define	KERN_BOOTTIME		21	/* struct: time kernel was booted */
 #define	KERN_NISDOMAINNAME	22	/* string: YP domain name */
 #define	KERN_UPDATEINTERVAL	23	/* int: update process sleep time */
 #define	KERN_OSRELDATE		24	/* int: kernel release date */
 #define	KERN_NTP_PLL		25	/* node: NTP PLL control */
 #define	KERN_BOOTFILE		26	/* string: name of booted kernel */
 #define	KERN_MAXFILESPERPROC	27	/* int: max open files per proc */
 #define	KERN_MAXPROCPERUID	28	/* int: max processes per uid */
 #define	KERN_DUMPDEV		29	/* struct cdev *: device to dump on */
 #define	KERN_IPC		30	/* node: anything related to IPC */
 #define	KERN_DUMMY		31	/* unused */
 #define	KERN_PS_STRINGS		32	/* int: address of PS_STRINGS */
 #define	KERN_USRSTACK		33	/* int: address of USRSTACK */
 #define	KERN_LOGSIGEXIT		34	/* int: do we log sigexit procs? */
 #define	KERN_IOV_MAX		35	/* int: value of UIO_MAXIOV */
 #define	KERN_HOSTUUID		36	/* string: host UUID identifier */
 #define	KERN_ARND		37	/* int: from arc4rand() */
 #define	KERN_MAXPHYS		38	/* int: MAXPHYS value */
 #define	KERN_LOCKF		39	/* struct: lockf reports */
 /*
  * KERN_PROC subtypes
  */
 #define	KERN_PROC_ALL		0	/* everything */
 #define	KERN_PROC_PID		1	/* by process id */
 #define	KERN_PROC_PGRP		2	/* by process group id */
 #define	KERN_PROC_SESSION	3	/* by session of pid */
 #define	KERN_PROC_TTY		4	/* by controlling tty */
 #define	KERN_PROC_UID		5	/* by effective uid */
 #define	KERN_PROC_RUID		6	/* by real uid */
 #define	KERN_PROC_ARGS		7	/* get/set arguments/proctitle */
 #define	KERN_PROC_PROC		8	/* only return procs */
 #define	KERN_PROC_SV_NAME	9	/* get syscall vector name */
 #define	KERN_PROC_RGID		10	/* by real group id */
 #define	KERN_PROC_GID		11	/* by effective group id */
 #define	KERN_PROC_PATHNAME	12	/* path to executable */
 #define	KERN_PROC_OVMMAP	13	/* Old VM map entries for process */
 #define	KERN_PROC_OFILEDESC	14	/* Old file descriptors for process */
 #define	KERN_PROC_KSTACK	15	/* Kernel stacks for process */
 #define	KERN_PROC_INC_THREAD	0x10	/*
 					 * modifier for pid, pgrp, tty,
 					 * uid, ruid, gid, rgid and proc
 					 * This effectively uses 16-31
 					 */
 #define	KERN_PROC_VMMAP		32	/* VM map entries for process */
 #define	KERN_PROC_FILEDESC	33	/* File descriptors for process */
 #define	KERN_PROC_GROUPS	34	/* process groups */
 #define	KERN_PROC_ENV		35	/* get environment */
 #define	KERN_PROC_AUXV		36	/* get ELF auxiliary vector */
 #define	KERN_PROC_RLIMIT	37	/* process resource limits */
 #define	KERN_PROC_PS_STRINGS	38	/* get ps_strings location */
 #define	KERN_PROC_UMASK		39	/* process umask */
 #define	KERN_PROC_OSREL		40	/* osreldate for process binary */
 #define	KERN_PROC_SIGTRAMP	41	/* signal trampoline location */
 #define	KERN_PROC_CWD		42	/* process current working directory */
 #define	KERN_PROC_NFDS		43	/* number of open file descriptors */
 #define	KERN_PROC_SIGFASTBLK	44	/* address of fastsigblk magic word */
 #define	KERN_PROC_VM_LAYOUT	45	/* virtual address space layout info */
 #define	KERN_PROC_RLIMIT_USAGE	46	/* array of rlim_t */
 #define	KERN_PROC_KQUEUE	47	/* array of struct kinfo_knote */
 
 /*
  * KERN_IPC identifiers
  */
 #define	KIPC_MAXSOCKBUF		1	/* int: max size of a socket buffer */
 #define	KIPC_SOCKBUF_WASTE	2	/* int: wastage factor in sockbuf */
 #define	KIPC_SOMAXCONN		3	/* int: max length of connection q */
 #define	KIPC_MAX_LINKHDR	4	/* int: max length of link header */
 #define	KIPC_MAX_PROTOHDR	5	/* int: max length of network header */
 #define	KIPC_MAX_HDR		6	/* int: max total length of headers */
 #define	KIPC_MAX_DATALEN	7	/* int: max length of data? */
 
 /*
  * CTL_HW identifiers
  */
 #define	HW_MACHINE	 1		/* string: machine class */
 #define	HW_MODEL	 2		/* string: specific machine model */
 #define	HW_NCPU		 3		/* int: number of cpus */
 #define	HW_BYTEORDER	 4		/* int: machine byte order */
 #define	HW_PHYSMEM	 5		/* int: total memory */
 #define	HW_USERMEM	 6		/* int: non-kernel memory */
 #define	HW_PAGESIZE	 7		/* int: software page size */
 #define	HW_DISKNAMES	 8		/* strings: disk drive names */
 #define	HW_DISKSTATS	 9		/* struct: diskstats[] */
 #define	HW_FLOATINGPT	10		/* int: has HW floating point? */
 #define	HW_MACHINE_ARCH	11		/* string: machine architecture */
 #define	HW_REALMEM	12		/* int: 'real' memory */
 
 /*
  * CTL_USER definitions
  */
 #define	USER_CS_PATH		 1	/* string: _CS_PATH */
 #define	USER_BC_BASE_MAX	 2	/* int: BC_BASE_MAX */
 #define	USER_BC_DIM_MAX		 3	/* int: BC_DIM_MAX */
 #define	USER_BC_SCALE_MAX	 4	/* int: BC_SCALE_MAX */
 #define	USER_BC_STRING_MAX	 5	/* int: BC_STRING_MAX */
 #define	USER_COLL_WEIGHTS_MAX	 6	/* int: COLL_WEIGHTS_MAX */
 #define	USER_EXPR_NEST_MAX	 7	/* int: EXPR_NEST_MAX */
 #define	USER_LINE_MAX		 8	/* int: LINE_MAX */
 #define	USER_RE_DUP_MAX		 9	/* int: RE_DUP_MAX */
 #define	USER_POSIX2_VERSION	10	/* int: POSIX2_VERSION */
 #define	USER_POSIX2_C_BIND	11	/* int: POSIX2_C_BIND */
 #define	USER_POSIX2_C_DEV	12	/* int: POSIX2_C_DEV */
 #define	USER_POSIX2_CHAR_TERM	13	/* int: POSIX2_CHAR_TERM */
 #define	USER_POSIX2_FORT_DEV	14	/* int: POSIX2_FORT_DEV */
 #define	USER_POSIX2_FORT_RUN	15	/* int: POSIX2_FORT_RUN */
 #define	USER_POSIX2_LOCALEDEF	16	/* int: POSIX2_LOCALEDEF */
 #define	USER_POSIX2_SW_DEV	17	/* int: POSIX2_SW_DEV */
 #define	USER_POSIX2_UPE		18	/* int: POSIX2_UPE */
 #define	USER_STREAM_MAX		19	/* int: POSIX2_STREAM_MAX */
 #define	USER_TZNAME_MAX		20	/* int: POSIX2_TZNAME_MAX */
 #define	USER_LOCALBASE		21	/* string: _PATH_LOCALBASE */
 
 /*
  * CTL_P1003_1B identifiers
  */
 #define	CTL_P1003_1B_ASYNCHRONOUS_IO		1	/* boolean */
 #define	CTL_P1003_1B_MAPPED_FILES		2	/* boolean */
 #define	CTL_P1003_1B_MEMLOCK			3	/* boolean */
 #define	CTL_P1003_1B_MEMLOCK_RANGE		4	/* boolean */
 #define	CTL_P1003_1B_MEMORY_PROTECTION		5	/* boolean */
 #define	CTL_P1003_1B_MESSAGE_PASSING		6	/* boolean */
 #define	CTL_P1003_1B_PRIORITIZED_IO		7	/* boolean */
 #define	CTL_P1003_1B_PRIORITY_SCHEDULING	8	/* boolean */
 #define	CTL_P1003_1B_REALTIME_SIGNALS		9	/* boolean */
 #define	CTL_P1003_1B_SEMAPHORES			10	/* boolean */
 #define	CTL_P1003_1B_FSYNC			11	/* boolean */
 #define	CTL_P1003_1B_SHARED_MEMORY_OBJECTS	12	/* boolean */
 #define	CTL_P1003_1B_SYNCHRONIZED_IO		13	/* boolean */
 #define	CTL_P1003_1B_TIMERS			14	/* boolean */
 #define	CTL_P1003_1B_AIO_LISTIO_MAX		15	/* int */
 #define	CTL_P1003_1B_AIO_MAX			16	/* int */
 #define	CTL_P1003_1B_AIO_PRIO_DELTA_MAX		17	/* int */
 #define	CTL_P1003_1B_DELAYTIMER_MAX		18	/* int */
 #define	CTL_P1003_1B_MQ_OPEN_MAX		19	/* int */
 #define	CTL_P1003_1B_PAGESIZE			20	/* int */
 #define	CTL_P1003_1B_RTSIG_MAX			21	/* int */
 #define	CTL_P1003_1B_SEM_NSEMS_MAX		22	/* int */
 #define	CTL_P1003_1B_SEM_VALUE_MAX		23	/* int */
 #define	CTL_P1003_1B_SIGQUEUE_MAX		24	/* int */
 #define	CTL_P1003_1B_TIMER_MAX			25	/* int */
 
 #ifdef _KERNEL
 
 #define	CTL_P1003_1B_MAXID		26
 
 /*
  * Declare some common oids.
  */
 extern struct sysctl_oid_list sysctl__children;
 SYSCTL_DECL(_kern);
 SYSCTL_DECL(_kern_features);
 SYSCTL_DECL(_kern_ipc);
 SYSCTL_DECL(_kern_proc);
 SYSCTL_DECL(_kern_sched);
 SYSCTL_DECL(_kern_sched_stats);
 SYSCTL_DECL(_sysctl);
 SYSCTL_DECL(_vm);
 SYSCTL_DECL(_vm_stats);
 SYSCTL_DECL(_vm_stats_misc);
 SYSCTL_DECL(_vfs);
 SYSCTL_DECL(_net);
 SYSCTL_DECL(_debug);
 SYSCTL_DECL(_debug_sizeof);
 SYSCTL_DECL(_dev);
 SYSCTL_DECL(_hw);
 SYSCTL_DECL(_hw_bus);
 SYSCTL_DECL(_hw_bus_devices);
 SYSCTL_DECL(_machdep);
 SYSCTL_DECL(_machdep_mitigations);
 SYSCTL_DECL(_user);
 SYSCTL_DECL(_compat);
 SYSCTL_DECL(_regression);
 SYSCTL_DECL(_security);
 SYSCTL_DECL(_security_bsd);
 
 extern const char	machine[];
 extern const char	osrelease[];
 extern const char	ostype[];
 extern const char	kern_ident[];
 
 /* Dynamic oid handling */
 struct sysctl_oid *sysctl_add_oid(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid_list *parent, int nbr, const char *name, int kind,
 	    void *arg1, intmax_t arg2, int (*handler)(SYSCTL_HANDLER_ARGS),
 	    const char *fmt, const char *descr, const char *label);
 int	sysctl_remove_name(struct sysctl_oid *parent, const char *name, int del,
 	    int recurse);
 void	sysctl_rename_oid(struct sysctl_oid *oidp, const char *name);
 int	sysctl_move_oid(struct sysctl_oid *oidp,
 	    struct sysctl_oid_list *parent);
 int	sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse);
 int	sysctl_ctx_init(struct sysctl_ctx_list *clist);
 int	sysctl_ctx_free(struct sysctl_ctx_list *clist);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_add(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 struct	sysctl_ctx_entry *sysctl_ctx_entry_find(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 int	sysctl_ctx_entry_del(struct sysctl_ctx_list *clist,
 	    struct sysctl_oid *oidp);
 
 int	kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 	    size_t *oldlenp, void *new, size_t newlen, size_t *retval,
 	    int flags);
 int	kernel_sysctlbyname(struct thread *td, char *name, void *old,
 	    size_t *oldlenp, void *new, size_t newlen, size_t *retval,
 	    int flags);
 int	userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
 	    size_t *oldlenp, int inkernel, const void *new, size_t newlen,
 	    size_t *retval, int flags);
 int	sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
 	    int *nindx, struct sysctl_req *req);
 void	sysctl_wlock(void);
 void	sysctl_wunlock(void);
 int	sysctl_wire_old_buffer(struct sysctl_req *req, size_t len);
 int	kern___sysctlbyname(struct thread *td, const char *name,
 	    size_t namelen, void *old, size_t *oldlenp, void *new,
 	    size_t newlen, size_t *retval, int flags, bool inkernel);
 
 struct sbuf;
 struct sbuf *sbuf_new_for_sysctl(struct sbuf *, char *, int,
 	    struct sysctl_req *);
 #else	/* !_KERNEL */
 #include <sys/cdefs.h>
 #include <sys/_types.h>
 #ifndef _SIZE_T_DECLARED
 typedef	__size_t	size_t;
 #define	_SIZE_T_DECLARED
 #endif
 
 __BEGIN_DECLS
 int	sysctl(const int *, unsigned int, void *, size_t *, const void *, size_t);
 int	sysctlbyname(const char *, void *, size_t *, const void *, size_t);
 int	sysctlnametomib(const char *, int *, size_t *);
 __END_DECLS
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_SYSCTL_H_ */