Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 310958)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 310959)
@@ -1,2517 +1,2517 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
 #include <sys/jail.h>
 #include "zfs_comutil.h"
 
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 
 int zfs_super_owner;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
     "File system owner can perform privileged operation on his file systems");
 
 int zfs_debug_level;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
     "Debug level");
 
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
 static int zfs_version_acl = ZFS_ACL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
     "ZFS_ACL_VERSION");
 static int zfs_version_spa = SPA_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
     "SPA_VERSION");
 static int zfs_version_zpl = ZPL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
     "ZPL_VERSION");
 
 static int zfs_mount(vfs_t *vfsp);
 static int zfs_umount(vfs_t *vfsp, int fflag);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor);
 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_checkexp =		zfs_checkexp,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 
 /*
  * We need to keep a count of active fs's.
  * This is necessary to prevent our module
  * from being unloaded after a umount -f
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*ARGSUSED*/
 static int
 zfs_sync(vfs_t *vfsp, int waitfor)
 {
 
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	/*
 	 * Ignore the system syncher.  ZFS already commits async data
 	 * at zfs_txg_timeout intervals.
 	 */
 	if (waitfor == MNT_LAZY)
 		return (0);
 
 	if (vfsp != NULL) {
 		/*
 		 * Sync a specific filesystem.
 		 */
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 		dsl_pool_t *dp;
 		int error;
 
 		error = vfs_stdsync(vfsp, waitfor);
 		if (error != 0)
 			return (error);
 
 		ZFS_ENTER(zfsvfs);
 		dp = dmu_objset_pool(zfsvfs->z_os);
 
 		/*
 		 * If the system is shutting down, then skip any
 		 * filesystems which may exist on a suspended pool.
 		 */
 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 
 		if (zfsvfs->z_log != NULL)
 			zil_commit(zfsvfs->z_log, 0);
 
 		ZFS_EXIT(zfsvfs);
 	} else {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 	}
 
 	return (0);
 }
 
 #ifndef __FreeBSD_kernel__
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
 		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
 			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
 				 * number space.  If we're using a getudev()'ed
 				 * major number, we can use all of its minors.
 				 */
 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 					zfs_minor = ZFS_MIN_MINOR;
 				else
 					zfs_minor = 0;
 			} else {
 				zfs_minor++;
 			}
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
 			 * current major number.  Create a new major number.
 			 */
 			if ((new_major = getudev()) == (major_t)-1) {
 				cmn_err(CE_WARN,
 				    "zfs_mount: Can't get unique major "
 				    "device number.");
 				return (-1);
 			}
 			mutex_enter(&zfs_dev_mtx);
 			zfs_major = new_major;
 			zfs_minor = 0;
 
 			mutex_exit(&zfs_dev_mtx);
 		} else {
 			break;
 		}
 		/* CONSTANTCONDITION */
 	} while (1);
 
 	return (0);
 }
 #endif	/* !__FreeBSD_kernel__ */
 
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		zfsvfs->z_atime = TRUE;
 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 	} else {
 		zfsvfs->z_atime = FALSE;
 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 	}
 }
 
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 	}
 }
 
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 	ASSERT(ISP2(newval));
 
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 }
 
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 	}
 }
 
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 	}
 }
 
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 	}
 }
 
 /*
  * The nbmand mount option can be changed at mount time.
  * We can't allow it to be toggled on live file systems or incorrect
  * behavior may be seen from cifs clients
  *
  * This property isn't registered via dsl_prop_register(), but this callback
  * will be called when a file system is first mounted
  */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	if (newval == FALSE) {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 	} else {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 	}
 }
 
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_show_ctldir = newval;
 }
 
 static void
 vscan_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_vscan = newval;
 }
 
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_mode = newval;
 }
 
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_inherit = newval;
 }
 
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
 	boolean_t readonly = B_FALSE;
 	boolean_t do_readonly = B_FALSE;
 	boolean_t setuid = B_FALSE;
 	boolean_t do_setuid = B_FALSE;
 	boolean_t exec = B_FALSE;
 	boolean_t do_exec = B_FALSE;
 #ifdef illumos
 	boolean_t devices = B_FALSE;
 	boolean_t do_devices = B_FALSE;
 #endif
 	boolean_t xattr = B_FALSE;
 	boolean_t do_xattr = B_FALSE;
 	boolean_t atime = B_FALSE;
 	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * This function can be called for a snapshot when we update snapshot's
 	 * mount point, which isn't really supported.
 	 */
 	if (dmu_objset_is_snapshot(os))
 		return (EOPNOTSUPP);
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid = B_FALSE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime = B_FALSE;
 		do_atime = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime = B_TRUE;
 		do_atime = B_TRUE;
 	}
 
 	/*
 	 * We need to enter pool configuration here, so that we can use
 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
 	 * dsl_prop_get_integer() can not be used, because it has to acquire
 	 * spa_namespace_lock and we can not do that because we already hold
 	 * z_teardown_lock.  The problem is that spa_config_sync() is called
 	 * with spa_namespace_lock held and the function calls ZFS vnode
 	 * operations to write the cache file and thus z_teardown_lock is
 	 * acquired after spa_namespace_lock.
 	 */
 	ds = dmu_objset_ds(os);
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 
 	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
 	 * This is weird, but it is documented to only be changeable
 	 * at mount time.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
 	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		return (error);
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	error = dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 #ifdef illumos
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 #endif
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 	    zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
 		atime_changed_cb(zfsvfs, atime);
 
 	nbmand_changed_cb(zfsvfs, nbmand);
 
 	return (0);
 
 unregister:
 	dsl_prop_unregister_all(ds, zfsvfs);
 	return (error);
 }
 
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
 	/*
 	 * Is it a valid type of object to track?
 	 */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * If we have a NULL data pointer
 	 * then assume the id's aren't changing and
 	 * return EEXIST to the dmu to let it know to
 	 * use the same ids
 	 */
 	if (data == NULL)
 		return (SET_ERROR(EEXIST));
 
 	if (bonustype == DMU_OT_ZNODE) {
 		znode_phys_t *znp = data;
 		*userp = znp->zp_uid;
 		*groupp = znp->zp_gid;
 	} else {
 		int hdrsize;
 		sa_hdr_phys_t *sap = data;
 		sa_hdr_phys_t sa = *sap;
 		boolean_t swap = B_FALSE;
 
 		ASSERT(bonustype == DMU_OT_SA);
 
 		if (sa.sa_magic == 0) {
 			/*
 			 * This should only happen for newly created
 			 * files that haven't had the znode data filled
 			 * in yet.
 			 */
 			*userp = 0;
 			*groupp = 0;
 			return (0);
 		}
 		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
 			sa.sa_magic = SA_MAGIC;
 			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 			swap = B_TRUE;
 		} else {
 			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
 		}
 
 		hdrsize = sa_hdrsize(&sa);
 		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 		    SA_UID_OFFSET));
 		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 		    SA_GID_OFFSET));
 		if (swap) {
 			*userp = BSWAP_64(*userp);
 			*groupp = BSWAP_64(*groupp);
 		}
 	}
 	return (0);
 }
 
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
 	uint64_t fuid;
 	const char *domain;
 
 	fuid = strtonum(fuidstr, NULL);
 
 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 	if (domain)
 		(void) strlcpy(domainbuf, domain, buflen);
 	else
 		domainbuf[0] = '\0';
 	*ridp = FUID_RID(fuid);
 }
 
 static uint64_t
 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 {
 	switch (type) {
 	case ZFS_PROP_USERUSED:
 		return (DMU_USERUSED_OBJECT);
 	case ZFS_PROP_GROUPUSED:
 		return (DMU_GROUPUSED_OBJECT);
 	case ZFS_PROP_USERQUOTA:
 		return (zfsvfs->z_userquota_obj);
 	case ZFS_PROP_GROUPQUOTA:
 		return (zfsvfs->z_groupquota_obj);
 	}
 	return (0);
 }
 
 int
 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 {
 	int error;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	zfs_useracct_t *buf = vbuf;
 	uint64_t obj;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
 		*bufsizep = 0;
 		return (0);
 	}
 
 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 		    *bufsizep)
 			break;
 
 		fuidstr_to_sid(zfsvfs, za.za_name,
 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 
 		buf->zu_space = za.za_first_integer;
 		buf++;
 	}
 	if (error == ENOENT)
 		error = 0;
 
 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 	*cookiep = zap_cursor_serialize(&zc);
 	zap_cursor_fini(&zc);
 	return (error);
 }
 
 /*
  * buf must be big enough (eg, 32 bytes)
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
     char *buf, boolean_t addok)
 {
 	uint64_t fuid;
 	int domainid = 0;
 
 	if (domain && domain[0]) {
 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 		if (domainid == -1)
 			return (SET_ERROR(ENOENT));
 	}
 	fuid = FUID_ENCODE(domainid, rid);
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	return (0);
 }
 
 int
 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valp)
 {
 	char buf[32];
 	int err;
 	uint64_t obj;
 
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 	if (err)
 		return (err);
 
 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 	if (err == ENOENT)
 		err = 0;
 	return (err);
 }
 
 int
 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota)
 {
 	char buf[32];
 	int err;
 	dmu_tx_t *tx;
 	uint64_t *objp;
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 		return (SET_ERROR(EINVAL));
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 		return (SET_ERROR(ENOTSUP));
 
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 	if (*objp == 0) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    zfs_userquota_prop_prefixes[type]);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	mutex_enter(&zfsvfs->z_lock);
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	if (quota == 0) {
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 	dmu_tx_commit(tx);
 	return (err);
 }
 
 boolean_t
 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	char buf[32];
 	uint64_t used, quota, usedobj, quotaobj;
 	int err;
 
 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 	if (err != 0)
 		return (B_FALSE);
 
 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 	if (err != 0)
 		return (B_FALSE);
 	return (used >= quota);
 }
 
 boolean_t
 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
 	uint64_t fuid;
 	uint64_t quotaobj;
 
 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
 	fuid = isgroup ? zp->z_gid : zp->z_uid;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
 /*
  * Associate this zfsvfs with the given objset, which must be owned.
  * This will cache a bunch of on-disk state from the objset in the
  * zfsvfs.
  */
 static int
 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 {
 	int error;
 	uint64_t val;
 
 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error != 0)
 		return (error);
 	if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 		return (SET_ERROR(ENOTSUP));
 	}
 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_norm = (int)val;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_utf8 = (val != 0);
 
 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_case = (uint_t)val;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 	    zfsvfs->z_case == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
 	uint64_t sa_obj = 0;
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		if (error != 0)
 			return (error);
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 	if (error != 0)
 		return (error);
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error != 0)
 		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
 	if (error == ENOENT)
 		zfsvfs->z_userquota_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
 	if (error == ENOENT)
 		zfsvfs->z_groupquota_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
 	if (error == ENOENT)
 		zfsvfs->z_fuid_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
 	if (error == ENOENT)
 		zfsvfs->z_shares_dir = 0;
 	else if (error != 0)
 		return (error);
 
 	/*
 	 * Only use the name cache if we are looking for a
 	 * name on a file system that does not require normalization
 	 * or case folding.  We can also look there if we happen to be
 	 * on a non-normalizing, mixed sensitivity file system IF we
 	 * are looking for the exact name (which is always the case on
 	 * FreeBSD).
 	 */
 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
 
 	return (0);
 }
 
 int
 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	/*
 	 * XXX: Fix struct statfs so this isn't necessary!
 	 *
 	 * The 'osname' is used as the filesystem's special node, which means
 	 * it must fit in statfs.f_mntfromname, or else it can't be
 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
 	 * 'zfs unmount' to think it's not mounted when it is.
 	 */
 	if (strlen(osname) >= MNAMELEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	/*
 	 * We claim to always be readonly so we can open snapshots;
 	 * other ZPL code will prevent us from writing to snapshots.
 	 */
 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 	if (error) {
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	zfsvfs->z_vfs = NULL;
 	zfsvfs->z_parent = zfsvfs;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 #ifdef DIAGNOSTIC
 	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
 #else
 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
 #endif
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	error = zfsvfs_init(zfsvfs, os);
 	if (error != 0) {
 		dmu_objset_disown(os, zfsvfs);
 		*zfvp = NULL;
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	*zfvp = zfsvfs;
 	return (0);
 }
 
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
 	int error;
 
 	error = zfs_register_callbacks(zfsvfs->z_vfs);
 	if (error)
 		return (error);
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
 	 * have to worry about replaying the log as we blocked all
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
 		boolean_t readonly;
 
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 		if (readonly != 0)
 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		else
 			zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 *
 		 * Because of ziltest, this must be done after
 		 * zfs_unlinked_drain().  (Further note: ziltest
 		 * doesn't use readonly mounts, where
 		 * zfs_unlinked_drain() isn't called.)  This is because
 		 * ziltest causes spa_sync() to think it's committed,
 		 * but actually it is not, so the intent log contains
 		 * many txg's worth of changes.
 		 *
 		 * In particular, if object N is in the unlinked set in
 		 * the last txg to actually sync, then it could be
 		 * actually freed in a later txg and then reallocated
 		 * in a yet later txg.  This would write a "create
 		 * object N" record to the intent log.  Normally, this
 		 * would be fine because the spa_sync() would have
 		 * written out the fact that object N is free, before
 		 * we could write the "create object N" intent log
 		 * record.
 		 *
 		 * But when we are in ziltest mode, we advance the "open
 		 * txg" without actually spa_sync()-ing the changes to
 		 * disk.  So we would see that object N is still
 		 * allocated and in the unlinked set, and there is an
 		 * intent log record saying to allocate it.
 		 */
 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
 			if (zil_replay_disable) {
 				zil_destroy(zfsvfs->z_log, B_FALSE);
 			} else {
 				zfsvfs->z_replay = B_TRUE;
 				zil_replay(zfsvfs->z_os, zfsvfs,
 				    zfs_replay_vector);
 				zfsvfs->z_replay = B_FALSE;
 			}
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
 	/*
 	 * Set the objset user_ptr to track its zfsvfs.
 	 */
 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
 	return (0);
 }
 
 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int i;
 
 	/*
 	 * This is a barrier to prevent the filesystem from going away in
 	 * zfs_znode_move() until we can safely ensure that the filesystem is
 	 * not unmounted. We consider the filesystem valid before the barrier
 	 * and invalid after the barrier.
 	 */
 	rw_enter(&zfsvfs_lock, RW_READER);
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	if (zfsvfs->z_vfs) {
 		if (zfsvfs->z_use_fuids) {
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 		} else {
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 		}
 	}
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 	vnode_t *vp;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	error = zfsvfs_create(osname, &zfsvfs);
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
 
 #ifdef illumos
 	/* Initialize the generic filesystem structure. */
 	vfsp->vfs_bcount = 0;
 	vfsp->vfs_data = NULL;
 
 	if (zfs_create_unique_device(&mount_dev) == -1) {
 		error = SET_ERROR(ENODEV);
 		goto out;
 	}
 	ASSERT(vfs_devismounted(mount_dev) == 0);
 #endif
 
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	vfsp->vfs_fsid.val[0] = fsid_guid;
 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
 
 	/*
 	 * Set features for file system.
 	 */
 	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 	} else {
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
 	vfs_mountedfrom(vfsp, osname);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
 		atomic_inc_32(&zfs_active_fs_count);
 	}
 
 	return (error);
 }
 
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 
 	if (!dmu_objset_is_snapshot(os))
 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
 }
 
 #ifdef SECLABEL
 /*
  * Convert a decimal digit string to a uint64_t integer.
  */
 static int
 str_to_uint64(char *str, uint64_t *objnum)
 {
 	uint64_t num = 0;
 
 	while (*str) {
 		if (*str < '0' || *str > '9')
 			return (SET_ERROR(EINVAL));
 
 		num = num*10 + *str++ - '0';
 	}
 
 	*objnum = num;
 	return (0);
 }
 
 /*
  * The boot path passed from the boot loader is in the form of
  * "rootpool-name/root-filesystem-object-number'. Convert this
  * string to a dataset name: "rootpool-name/root-filesystem-name".
  */
 static int
 zfs_parse_bootfs(char *bpath, char *outpath)
 {
 	char *slashp;
 	uint64_t objnum;
 	int error;
 
 	if (*bpath == 0 || *bpath == '/')
 		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(outpath, bpath);
 
 	slashp = strchr(bpath, '/');
 
 	/* if no '/', just return the pool name */
 	if (slashp == NULL) {
 		return (0);
 	}
 
 	/* if not a number, just return the root dataset name */
 	if (str_to_uint64(slashp+1, &objnum)) {
 		return (0);
 	}
 
 	*slashp = '\0';
 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
 	*slashp = '/';
 
 	return (error);
 }
 
 /*
  * Check that the hex label string is appropriate for the dataset being
  * mounted into the global_zone proper.
  *
  * Return an error if the hex label string is not default or
  * admin_low/admin_high.  For admin_low labels, the corresponding
  * dataset must be readonly.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
 		/* must be readonly */
 		uint64_t rdonly;
 
 		if (dsl_prop_get_integer(dsname,
 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
 			return (SET_ERROR(EACCES));
 		return (rdonly ? 0 : EACCES);
 	}
 	return (SET_ERROR(EACCES));
 }
 
 /*
  * Determine whether the mount is allowed according to MAC check.
  * by comparing (where appropriate) label of the dataset against
  * the label of the zone being mounted into.  If the dataset has
  * no label, create one.
  *
  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
 	int		error, retv;
 	zone_t		*mntzone = NULL;
 	ts_label_t	*mnt_tsl;
 	bslabel_t	*mnt_sl;
 	bslabel_t	ds_sl;
 	char		ds_hexsl[MAXNAMELEN];
 
 	retv = EACCES;				/* assume the worst */
 
 	/*
 	 * Start by getting the dataset label if it exists.
 	 */
 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error)
 		return (SET_ERROR(EACCES));
 
 	/*
 	 * If labeling is NOT enabled, then disallow the mount of datasets
 	 * which have a non-default label already.  No other label checks
 	 * are needed.
 	 */
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 			return (0);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Get the label of the mountpoint.  If mounting into the global
 	 * zone (i.e. mountpoint is not within an active zone and the
 	 * zoned property is off), the label must be default or
 	 * admin_low/admin_high only; no other checks are needed.
 	 */
 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
 	if (mntzone->zone_id == GLOBAL_ZONEID) {
 		uint64_t zoned;
 
 		zone_rele(mntzone);
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 			return (SET_ERROR(EACCES));
 		if (!zoned)
 			return (zfs_check_global_label(osname, ds_hexsl));
 		else
 			/*
 			 * This is the case of a zone dataset being mounted
 			 * initially, before the zone has been fully created;
 			 * allow this mount into global zone.
 			 */
 			return (0);
 	}
 
 	mnt_tsl = mntzone->zone_slabel;
 	ASSERT(mnt_tsl != NULL);
 	label_hold(mnt_tsl);
 	mnt_sl = label2bslabel(mnt_tsl);
 
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
 		/*
 		 * The dataset doesn't have a real label, so fabricate one.
 		 */
 		char *str = NULL;
 
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
 		    dsl_prop_set_string(osname,
 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 		    ZPROP_SRC_LOCAL, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
 		/*
 		 * Now compare labels to complete the MAC check.  If the
 		 * labels are equal then allow access.  If the mountpoint
 		 * label dominates the dataset label, allow readonly access.
 		 * Otherwise, access is denied.
 		 */
 		if (blequal(mnt_sl, &ds_sl))
 			retv = 0;
 		else if (bldominates(mnt_sl, &ds_sl)) {
 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 			retv = 0;
 		}
 	}
 
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
 }
 #endif	/* SECLABEL */
 
 #ifdef OPENSOLARIS_MOUNTROOT
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
 	int error = 0;
 	static int zfsrootdone = 0;
 	zfsvfs_t *zfsvfs = NULL;
 	znode_t *zp = NULL;
 	vnode_t *vp = NULL;
 	char *zfs_bootfs;
 	char *zfs_devid;
 
 	ASSERT(vfsp);
 
 	/*
 	 * The filesystem that we mount as root is defined in the
 	 * boot property "zfs-bootfs" with a format of
 	 * "poolname/root-dataset-objnum".
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
 			return (SET_ERROR(EBUSY));
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
 		 * something better by looking at the timestamp on
 		 * an uberblock, so just set it to -1.
 		 */
 		clkset(-1);
 
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
 			return (SET_ERROR(EINVAL));
 		}
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
 		if (zfs_devid)
 			spa_free_bootprop(zfs_devid);
 		if (error) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
 			    error);
 			return (error);
 		}
 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
 			    error);
 			return (error);
 		}
 
 		spa_free_bootprop(zfs_bootfs);
 
 		if (error = vfs_lock(vfsp))
 			return (error);
 
 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
 			goto out;
 		}
 
 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
 		ASSERT(zfsvfs);
 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
 			goto out;
 		}
 
 		vp = ZTOV(zp);
 		mutex_enter(&vp->v_lock);
 		vp->v_flag |= VROOT;
 		mutex_exit(&vp->v_lock);
 		rootvp = vp;
 
 		/*
 		 * Leave rootvp held.  The root file system is never unmounted.
 		 */
 
 		vfs_add((struct vnode *)0, vfsp,
 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
 out:
 		vfs_unlock(vfsp);
 		return (error);
 	} else if (why == ROOT_REMOUNT) {
 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
 		vfsp->vfs_flag |= VFS_REMOUNT;
 
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		return (zfs_register_callbacks(vfsp));
 
 	} else if (why == ROOT_UNMOUNT) {
 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
 		(void) zfs_sync(vfsp, 0, 0);
 		return (0);
 	}
 
 	/*
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
 	return (SET_ERROR(ENOTSUP));
 }
 #endif	/* OPENSOLARIS_MOUNTROOT */
 
 static int
 getpoolname(const char *osname, char *poolname)
 {
 	char *p;
 
 	p = strchr(osname, '/');
 	if (p == NULL) {
 		if (strlen(osname) >= MAXNAMELEN)
 			return (ENAMETOOLONG);
 		(void) strcpy(poolname, osname);
 	} else {
 		if (p - osname >= MAXNAMELEN)
 			return (ENAMETOOLONG);
 		(void) strncpy(poolname, osname, p - osname);
 		poolname[p - osname] = '\0';
 	}
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp)
 {
 	kthread_t	*td = curthread;
 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
 	cred_t		*cr = td->td_ucred;
 	char		*osname;
 	int		error = 0;
 	int		canwrite;
 
 #ifdef illumos
 	if (mvp->v_type != VDIR)
 		return (SET_ERROR(ENOTDIR));
 
 	mutex_enter(&mvp->v_lock);
 	if ((uap->flags & MS_REMOUNT) == 0 &&
 	    (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 		mutex_exit(&mvp->v_lock);
 		return (SET_ERROR(EBUSY));
 	}
 	mutex_exit(&mvp->v_lock);
 
 	/*
 	 * ZFS does not support passing unparsed data in via MS_DATA.
 	 * Users should use the MS_OPTIONSTR interface; this means
 	 * that all option parsing is already done and the options struct
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
 #else	/* !illumos */
 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
 		return (SET_ERROR(EPERM));
 
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 		return (SET_ERROR(EINVAL));
 #endif	/* illumos */
 
 	/*
 	 * If full-owner-access is enabled and delegated administration is
 	 * turned on, we must set nosuid.
 	 */
 	if (zfs_super_owner &&
 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Check for mount privilege?
 	 *
 	 * If we don't have privilege then see if
 	 * we have local permission to allow it
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
 			goto out;
 
 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
 			vattr_t		vattr;
 
 			/*
 			 * Make sure user is the owner of the mount point
 			 * or has sufficient privileges.
 			 */
 
 			vattr.va_mask = AT_UID;
 
 			vn_lock(mvp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(mvp, &vattr, cr)) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 			VOP_UNLOCK(mvp, 0);
 		}
 
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
 	if (!INGLOBALZONE(curthread) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 #ifdef SECLABEL
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
 #endif
 
 	vfsp->vfs_flag |= MNT_NFS4ACLS;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT) {
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 		/*
 		 * Refresh mount options with z_teardown_lock blocking I/O while
 		 * the filesystem is in an inconsistent state.
 		 * The lock also serializes this code with filesystem
 		 * manipulations between entry to zfs_suspend_fs() and return
 		 * from zfs_resume_fs().
 		 */
 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfs_unregister_callbacks(zfsvfs);
 		error = zfs_register_callbacks(vfsp);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		goto out;
 	}
 
 	/* Initial root mount: try hard to import the requested root pool. */
 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
 		char pname[MAXNAMELEN];
 
 		error = getpoolname(osname, pname);
 		if (error == 0)
 			error = spa_import_rootpool(pname);
 		if (error)
 			goto out;
 	}
 	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
 
 #ifdef illumos
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
 #endif
 
 out:
 	return (error);
 }
 
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
 
 	/*
 	 * The underlying storage pool actually uses multiple block sizes.
 	 * We report the fragsize as the smallest block size we support,
 	 * and we report our blocksize as the filesystem's maximum blocksize.
 	 */
 	statp->f_bsize = SPA_MINBLOCKSIZE;
 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
 
 	/*
 	 * The following report "total" blocks of various kinds in the
 	 * file system, but reported in terms of f_frsize - the
 	 * "fragment" size.
 	 */
 
 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
 	statp->f_bfree = availbytes / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree; /* no root reservation */
 
 	/*
 	 * statvfs() should really be called statufs(), because it assumes
 	 * static metadata.  ZFS doesn't preallocate files, so the best
 	 * we can do is report the max that could possibly fit in f_files,
 	 * and that minus the number actually used in f_ffree.
 	 * For f_ffree, report the smaller of the number of object available
 	 * and the number of blocks (each object will take at least a block).
 	 */
 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
 	statp->f_files = statp->f_ffree + usedobjs;
 
 	/*
 	 * We're a zfs filesystem.
 	 */
 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
 
 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
 	statp->f_namemax = MAXNAMELEN - 1;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
 		*vpp = ZTOV(rootzp);
 
 	ZFS_EXIT(zfsvfs);
 
 	if (error == 0) {
 		error = vn_lock(*vpp, flags);
 		if (error != 0) {
 			VN_RELE(*vpp);
 			*vpp = NULL;
 		}
 	}
 	return (error);
 }
 
 /*
  * Teardown the zfsvfs::z_os.
  *
  * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held.
  */
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;
 
 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
 		 * We purge the parent filesystem's vfsp as the parent
 		 * filesystem and all of its snapshots have their vnode's
 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 #ifdef FREEBSD_NAMECACHE
-		cache_purgevfs(zfsvfs->z_parent->z_vfs);
+		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
 #endif
 	}
 
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 
 	/*
 	 * If we are not unmounting (ie: online recv) and someone already
 	 * unmounted this file system while we were doing the switcheroo,
 	 * or a reopen of z_os failed then just bail out now.
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * At this point there are no vops active, and any new vops will
 	 * fail with EIO since we have z_teardown_lock for writer (only
 	 * relavent for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
 		if (zp->z_sa_hdl) {
 			ASSERT(ZTOV(zp)->v_count >= 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * If we are unmounting, set the unmounted flag and let new vops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
 	 * other vops will fail with EIO.
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
 	/*
 	 * z_os will be NULL if there was an error in attempting to reopen
 	 * zfsvfs, so just return as the properties had already been
 	 * unregistered and cached data had been evicted before.
 	 */
 	if (zfsvfs->z_os == NULL)
 		return (0);
 
 	/*
 	 * Unregister properties.
 	 */
 	zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Evict cached data
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag)
 {
 	kthread_t *td = curthread;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
 
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
 	 * referential for non-snapshots.
 	 */
 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	if (fflag & MS_FORCE) {
 		/*
 		 * Mark file system as unmounted before calling
 		 * vflush(FORCECLOSE). This way we ensure no future vnops
 		 * will be called and risk operating on DOOMED vnodes.
 		 */
 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 #ifdef illumos
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
 		 * Our count is maintained in the vfs structure, but the
 		 * number is off by 1 to indicate a hold on the vfs
 		 * structure itself.
 		 *
 		 * The '.zfs' directory maintains a reference of its
 		 * own, and any active references underneath are
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
 			if (vfsp->vfs_count > 1)
 				return (SET_ERROR(EBUSY));
 		} else {
 			if (vfsp->vfs_count > 2 ||
 			    zfsvfs->z_ctldir->v_count > 1)
 				return (SET_ERROR(EBUSY));
 		}
 	}
 #endif
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
 
 	/*
 	 * z_os will be NULL if there was an error in
 	 * attempting to reopen zfsvfs.
 	 */
 	if (os != NULL) {
 		/*
 		 * Unset the objset user_ptr.
 		 */
 		mutex_enter(&os->os_user_ptr_lock);
 		dmu_objset_set_user(os, NULL);
 		mutex_exit(&os->os_user_ptr_lock);
 
 		/*
 		 * Finally release the objset
 		 */
 		dmu_objset_disown(os, zfsvfs);
 	}
 
 	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int 		err;
 
 	/*
 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
 	 * This will make NFS to switch to LOOKUP instead of using VGET.
 	 */
 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
 		return (EOPNOTSUPP);
 
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0 && zp->z_unlinked) {
 		vrele(ZTOV(zp));
 		err = EINVAL;
 	}
 	if (err == 0)
 		*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	if (err == 0)
 		err = vn_lock(*vpp, flags);
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 static int
 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 	/*
 	 * If this is regular file system vfsp is the same as
 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
 	 * zfsvfs->z_parent->z_vfs represents parent file system
 	 * which we have to use here, because only this file system
 	 * has mnt_export configured.
 	 */
 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
 	    credanonp, numsecflavors, secflavors));
 }
 
 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
 
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int 		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * On FreeBSD we can get snapshot's mount point or its parent file
 	 * system mount point depending if snapshot is already mounted or not.
 	 */
 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (SET_ERROR(EINVAL));
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
 	 * we are in the .zfs/shares directory tree.
 	 */
 	if ((fid_gen == 0 &&
 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else if (object == zfsvfs->z_shares_dir) {
 			VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else {
 			vref(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		err = vn_lock(*vpp, flags);
 		if (err != 0)
 			*vpp = NULL;
 		return (err);
 	}
 
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 	if (err = zfs_zget(zfsvfs, object, &zp)) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
 	    sizeof (uint64_t));
 	zp_gen = zp_gen & gen_mask;
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		vrele(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	err = vn_lock(*vpp, flags | LK_RETRY);
 	if (err == 0)
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
 	else
 		*vpp = NULL;
 	return (err);
 }
 
 /*
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
  * dataset and objset intact so that they can be atomically handed off during
  * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
 	int error;
 
 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
 		return (error);
 
 	return (0);
 }
 
 /*
  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
  * is an invariant across any of the operations that can be performed while the
  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
  * are the same: the relevant objset and associated dataset are owned by
  * zfsvfs, held, and long held on entry.
  */
 int
 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 {
 	int err;
 	znode_t *zp;
 
 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
 	/*
 	 * We already own this, so just update the objset_t, as the one we
 	 * had before may have been evicted.
 	 */
 	objset_t *os;
 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
 	VERIFY(dsl_dataset_long_held(ds));
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	err = zfsvfs_init(zfsvfs, os);
 	if (err != 0)
 		goto bail;
 
 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	/*
 	 * Attempt to re-establish all the active znodes with
 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
 	 * when they try to use their znode.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 		(void) zfs_rezget(zp);
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
 		 * Since we couldn't setup the sa framework, try to force
 		 * unmount this file system.
 		 */
 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
 			vfs_ref(zfsvfs->z_vfs);
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
 		}
 	}
 	return (err);
 }
 
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 #ifdef illumos
 	/*
 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
 	 * from zfs_mount().  Release it here.  If we came through
 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
 	 * skip the VFS_RELE for rootvfs.
 	 */
 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
 		VFS_RELE(zfsvfs->z_parent->z_vfs);
 #endif
 
 	zfsvfs_free(zfsvfs);
 
 	atomic_dec_32(&zfs_active_fs_count);
 }
 
 #ifdef __i386__
 static int desiredvnodes_backup;
 #endif
 
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int newdesiredvnodes;
 
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * We calculate newdesiredvnodes the same way it is done in
 	 * vntblinit(). If it is equal to desiredvnodes, it means that
 	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
 	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
 	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode))));
 	if (newdesiredvnodes == desiredvnodes)
 		desiredvnodes = (3 * newdesiredvnodes) / 4;
 #endif
 }
 
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 int
 zfs_busy(void)
 {
 	return (zfs_active_fs_count != 0);
 }
 
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
 	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (SET_ERROR(EINVAL));
 
 	if (newvers < zfsvfs->z_version)
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_spa_version_map(newvers) >
 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
 		return (SET_ERROR(ENOTSUP));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    ZFS_SA_ATTRS);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		uint64_t sa_obj;
 
 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
 		    SPA_VERSION_SA);
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 
 		error = zap_add(os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
 	    "from %llu to %llu", zfsvfs->z_version, newvers);
 
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Read a property stored within the master node.
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
 	const char *pname;
 	int error = ENOENT;
 
 	/*
 	 * Look up the file system's value for the property.  For the
 	 * version property, we look up a slightly different string.
 	 */
 	if (prop == ZFS_PROP_VERSION)
 		pname = ZPL_VERSION_STR;
 	else
 		pname = zfs_prop_to_name(prop);
 
 	if (os != NULL)
 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
 
 	if (error == ENOENT) {
 		/* No value set, use the default value */
 		switch (prop) {
 		case ZFS_PROP_VERSION:
 			*value = ZPL_VERSION;
 			break;
 		case ZFS_PROP_NORMALIZE:
 		case ZFS_PROP_UTF8ONLY:
 			*value = 0;
 			break;
 		case ZFS_PROP_CASE:
 			*value = ZFS_CASE_SENSITIVE;
 			break;
 		default:
 			return (error);
 		}
 		error = 0;
 	}
 	return (error);
 }
 
 #ifdef _KERNEL
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 	char tmpbuf[MAXPATHLEN];
 	struct mount *mp;
 	char *fromname;
 	size_t oldlen;
 
 	oldlen = strlen(oldname);
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		fromname = mp->mnt_stat.f_mntfromname;
 		if (strcmp(fromname, oldname) == 0) {
 			(void)strlcpy(fromname, newname,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 			continue;
 		}
 		if (strncmp(fromname, oldname, oldlen) == 0 &&
 		    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
 			(void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
 			    newname, fromname + oldlen);
 			(void)strlcpy(fromname, tmpbuf,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 			continue;
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 #endif
Index: stable/11/sys/kern/subr_witness.c
===================================================================
--- stable/11/sys/kern/subr_witness.c	(revision 310958)
+++ stable/11/sys/kern/subr_witness.c	(revision 310959)
@@ -1,3018 +1,3026 @@
 /*-
  * Copyright (c) 2008 Isilon Systems, Inc.
  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
  * Copyright (c) 1998 Berkeley Software Design, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of the `witness' lock verifier.  Originally implemented for
  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
  * classes in FreeBSD.
  */
 
 /*
  *	Main Entry: witness
  *	Pronunciation: 'wit-n&s
  *	Function: noun
  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
  *	    testimony, witness, from 2wit
  *	Date: before 12th century
  *	1 : attestation of a fact or event : TESTIMONY
  *	2 : one that gives evidence; specifically : one who testifies in
  *	    a cause or before a judicial tribunal
  *	3 : one asked to be present at a transaction so as to be able to
  *	    testify to its having taken place
  *	4 : one who has personal knowledge of something
  *	5 a : something serving as evidence or proof : SIGN
  *	  b : public affirmation by word or example of usually
  *	      religious faith or conviction <the heroic witness to divine
  *	      life -- Pilot>
  *	6 capitalized : a member of the Jehovah's Witnesses 
  */
 
 /*
  * Special rules concerning Giant and lock orders:
  *
  * 1) Giant must be acquired before any other mutexes.  Stated another way,
  *    no other mutex may be held when Giant is acquired.
  *
  * 2) Giant must be released when blocking on a sleepable lock.
  *
  * This rule is less obvious, but is a result of Giant providing the same
  * semantics as spl().  Basically, when a thread sleeps, it must release
  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
  * 2).
  *
  * 3) Giant may be acquired before or after sleepable locks.
  *
  * This rule is also not quite as obvious.  Giant may be acquired after
  * a sleepable lock because it is a non-sleepable lock and non-sleepable
  * locks may always be acquired while holding a sleepable lock.  The second
  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
  * execute.  Thus, acquiring Giant both before and after a sleepable lock
  * will not result in a lock order reversal.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_stack.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #if !defined(DDB) && !defined(STACK)
 #error "DDB or STACK options are required for WITNESS"
 #endif
 
 /* Note that these traces do not work with KTR_ALQ. */
 #if 0
 #define	KTR_WITNESS	KTR_SUBSYS
 #else
 #define	KTR_WITNESS	0
 #endif
 
 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
 
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
 #ifndef WITNESS_COUNT
 #define	WITNESS_COUNT 		1536
 #endif
 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
 #define	WITNESS_PENDLIST	(1024 + MAXCPU)
 
 /* Allocate 256 KB of stack data space */
 #define	WITNESS_LO_DATA_COUNT	2048
 
 /* Prime, gives load factor of ~2 at full load */
 #define	WITNESS_LO_HASH_SIZE	1021
 
 /*
  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
  * probably be safe for the most part, but it's still a SWAG.
  */
 #define	LOCK_NCHILDREN	5
 #define	LOCK_CHILDCOUNT	2048
 
 #define	MAX_W_NAME	64
 
 #define	FULLGRAPH_SBUF_SIZE	512
 
 /*
  * These flags go in the witness relationship matrix and describe the
  * relationship between any two struct witness objects.
  */
 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
 #define	WITNESS_RELATED_MASK						\
 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
 					  * observed. */
 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
 
 /* Descendant to ancestor flags */
 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
 
 /* Ancestor to descendant flags */
 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
 
 #define	WITNESS_INDEX_ASSERT(i)						\
 	MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
 
 static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
 
 /*
  * Lock instances.  A lock instance is the data associated with a lock while
  * it is held by witness.  For example, a lock instance will hold the
  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
  * are held in a per-cpu list while sleep locks are held in per-thread list.
  */
 struct lock_instance {
 	struct lock_object	*li_lock;
 	const char		*li_file;
 	int			li_line;
 	u_int			li_flags;
 };
 
 /*
  * A simple list type used to build the list of locks held by a thread
  * or CPU.  We can't simply embed the list in struct lock_object since a
  * lock may be held by more than one thread if it is a shared lock.  Locks
  * are added to the head of the list, so we fill up each list entry from
  * "the back" logically.  To ease some of the arithmetic, we actually fill
  * in each list entry the normal way (children[0] then children[1], etc.) but
  * when we traverse the list we read children[count-1] as the first entry
  * down to children[0] as the final entry.
  */
 struct lock_list_entry {
 	struct lock_list_entry	*ll_next;
 	struct lock_instance	ll_children[LOCK_NCHILDREN];
 	u_int			ll_count;
 };
 
 /*
  * The main witness structure. One of these per named lock type in the system
  * (for example, "vnode interlock").
  */
 struct witness {
 	char  			w_name[MAX_W_NAME];
 	uint32_t 		w_index;  /* Index in the relationship matrix */
 	struct lock_class	*w_class;
 	STAILQ_ENTRY(witness) 	w_list;		/* List of all witnesses. */
 	STAILQ_ENTRY(witness) 	w_typelist;	/* Witnesses of a type. */
 	struct witness		*w_hash_next; /* Linked list in hash buckets. */
 	const char		*w_file; /* File where last acquired */
 	uint32_t 		w_line; /* Line where last acquired */
 	uint32_t 		w_refcount;
 	uint16_t 		w_num_ancestors; /* direct/indirect
 						  * ancestor count */
 	uint16_t 		w_num_descendants; /* direct/indirect
 						    * descendant count */
 	int16_t 		w_ddb_level;
 	unsigned		w_displayed:1;
 	unsigned		w_reversed:1;
 };
 
 STAILQ_HEAD(witness_list, witness);
 
 /*
  * The witness hash table. Keys are witness names (const char *), elements are
  * witness objects (struct witness *).
  */
 struct witness_hash {
 	struct witness	*wh_array[WITNESS_HASH_SIZE];
 	uint32_t	wh_size;
 	uint32_t	wh_count;
 };
 
 /*
  * Key type for the lock order data hash table.
  */
 struct witness_lock_order_key {
 	uint16_t	from;
 	uint16_t	to;
 };
 
 struct witness_lock_order_data {
 	struct stack			wlod_stack;
 	struct witness_lock_order_key	wlod_key;
 	struct witness_lock_order_data	*wlod_next;
 };
 
 /*
  * The witness lock order data hash table. Keys are witness index tuples
  * (struct witness_lock_order_key), elements are lock order data objects
  * (struct witness_lock_order_data). 
  */
 struct witness_lock_order_hash {
 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
 	u_int	wloh_size;
 	u_int	wloh_count;
 };
 
 #ifdef BLESSING
 struct witness_blessed {
 	const char	*b_lock1;
 	const char	*b_lock2;
 };
 #endif
 
 struct witness_pendhelp {
 	const char		*wh_type;
 	struct lock_object	*wh_lock;
 };
 
 struct witness_order_list_entry {
 	const char		*w_name;
 	struct lock_class	*w_class;
 };
 
 /*
  * Returns 0 if one of the locks is a spin lock and the other is not.
  * Returns 1 otherwise.
  */
 static __inline int
 witness_lock_type_equal(struct witness *w1, struct witness *w2)
 {
 
 	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
 		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
 }
 
 static __inline int
 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
     const struct witness_lock_order_key *b)
 {
 
 	return (a->from == b->from && a->to == b->to);
 }
 
 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
 		    const char *fname);
 static void	adopt(struct witness *parent, struct witness *child);
 #ifdef BLESSING
 static int	blessed(struct witness *, struct witness *);
 #endif
 static void	depart(struct witness *w);
 static struct witness	*enroll(const char *description,
 			    struct lock_class *lock_class);
 static struct lock_instance	*find_instance(struct lock_list_entry *list,
 				    const struct lock_object *lock);
 static int	isitmychild(struct witness *parent, struct witness *child);
 static int	isitmydescendant(struct witness *parent, struct witness *child);
 static void	itismychild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
 static void	witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
 #ifdef DDB
 static void	witness_ddb_compute_levels(void);
 static void	witness_ddb_display(int(*)(const char *fmt, ...));
 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
 		    struct witness *, int indent);
 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
 		    struct witness_list *list);
 static void	witness_ddb_level_descendants(struct witness *parent, int l);
 static void	witness_ddb_list(struct thread *td);
 #endif
 static void	witness_debugger(int cond, const char *msg);
 static void	witness_free(struct witness *m);
 static struct witness	*witness_get(void);
 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
 static struct witness	*witness_hash_get(const char *key);
 static void	witness_hash_put(struct witness *w);
 static void	witness_init_hash_tables(void);
 static void	witness_increment_graph_generation(void);
 static void	witness_lock_list_free(struct lock_list_entry *lle);
 static struct lock_list_entry	*witness_lock_list_get(void);
 static int	witness_lock_order_add(struct witness *parent,
 		    struct witness *child);
 static int	witness_lock_order_check(struct witness *parent,
 		    struct witness *child);
 static struct witness_lock_order_data	*witness_lock_order_get(
 					    struct witness *parent,
 					    struct witness *child);
 static void	witness_list_lock(struct lock_instance *instance,
 		    int (*prnt)(const char *fmt, ...));
 static int	witness_output(const char *fmt, ...) __printflike(1, 2);
 static int	witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
 static void	witness_setflag(struct lock_object *lock, int flag, int set);
 
 static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
     "Witness Locking");
 
 /*
  * If set to 0, lock order checking is disabled.  If set to -1,
  * witness is completely disabled.  Otherwise witness performs full
  * lock order checking for all locks.  At runtime, lock order checking
  * may be toggled.  However, witness cannot be reenabled once it is
  * completely disabled.
  */
 static int witness_watch = 1;
 SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
     sysctl_debug_witness_watch, "I", "witness is watching lock operations");
 
 #ifdef KDB
 /*
  * When KDB is enabled and witness_kdb is 1, it will cause the system
  * to drop into kdebug() when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
 int	witness_kdb = 1;
 #else
 int	witness_kdb = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
 #endif /* KDB */
 
 #if defined(DDB) || defined(KDB)
 /*
  * When DDB or KDB is enabled and witness_trace is 1, it will cause the system
  * to print a stack trace:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
 SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
 #endif /* DDB || KDB */
 
 #ifdef WITNESS_SKIPSPIN
 int	witness_skipspin = 1;
 #else
 int	witness_skipspin = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
 
 int badstack_sbuf_size;
 
 int witness_count = WITNESS_COUNT;
 SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, 
     &witness_count, 0, "");
 
 /*
  * Output channel for witness messages.  By default we print to the console.
  */
 enum witness_channel {
 	WITNESS_CONSOLE,
 	WITNESS_LOG,
 	WITNESS_NONE,
 };
 
 static enum witness_channel witness_channel = WITNESS_CONSOLE;
 SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
     CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
     "Output channel for warnings");
 
 /*
  * Call this to print out the relations between locks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
 
 /*
  * Call this to print out the witness faulty stacks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
 
 static struct mtx w_mtx;
 
 /* w_list */
 static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
 static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
 
 /* w_typelist */
 static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
 static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
 
 /* lock list */
 static struct lock_list_entry *w_lock_list_free = NULL;
 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
 static u_int pending_cnt;
 
 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
 SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
     "");
 
 static struct witness *w_data;
 static uint8_t **w_rmatrix;
 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
 static struct witness_hash w_hash;	/* The witness hash table. */
 
 /* The lock order data hash */
 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
 static struct witness_lock_order_data *w_lofree = NULL;
 static struct witness_lock_order_hash w_lohash;
 static int w_max_used_index = 0;
 static unsigned int w_generation = 0;
 static const char w_notrunning[] = "Witness not running\n";
 static const char w_stillcold[] = "Witness is still cold\n";
 
 
 static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * sx locks
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
 	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
 	{ "process lock", &lock_class_mtx_sleep },
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_rw },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-sleep", &lock_class_mtx_sleep },
 #endif
 	{ "time lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * umtx
 	 */
 	{ "umtx lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "sellck", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "radix node head", &lock_class_rw },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv4 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in_multi_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_lock", &lock_class_rw },
 	{ NULL, NULL },
 	/*
 	 * IPv6 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in6_multi_mtx", &lock_class_mtx_sleep },
 	{ "mld_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_lock", &lock_class_rw },
 	{ NULL, NULL },
 	/*
 	 * UNIX Domain Sockets
 	 */
 	{ "unp_link_rwlock", &lock_class_rw },
 	{ "unp_list_lock", &lock_class_mtx_sleep },
 	{ "unp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UDP/IP
 	 */
 	{ "udp", &lock_class_rw },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
 	{ "tcp", &lock_class_rw },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * BPF
 	 */
 	{ "bpf global lock", &lock_class_mtx_sleep },
 	{ "bpf interface lock", &lock_class_rw },
 	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
 	 */
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 
 	/*
 	 * IEEE 802.11
 	 */
 	{ "802.11 com lock", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 	/*
 	 * Network drivers
 	 */
 	{ "network driver", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 
 	/*
 	 * Netgraph
 	 */
 	{ "ng_node", &lock_class_mtx_sleep },
 	{ "ng_worklist", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
 	{ "vm map (system)", &lock_class_mtx_sleep },
 	{ "vm pagequeue", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
 	 */
 	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_rw },
 	{ "vm page", &lock_class_mtx_sleep },
 	{ "vm pagequeue", &lock_class_mtx_sleep },
 	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
 	{ "pmap pv list", &lock_class_rw },
 	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
 	 */
 	{ "kqueue", &lock_class_mtx_sleep },
 	{ "struct mount mtx", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * VFS namecache
+	 */
+	{ "ncvn", &lock_class_mtx_sleep },
+	{ "ncbuc", &lock_class_rw },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ "ncneg", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },
 	{ "dr->dt.di.dr_mtx", &lock_class_sx },
 	{ "db->db_mtx", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * spin locks
 	 */
 #ifdef SMP
 	{ "ap boot", &lock_class_mtx_spin },
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
 	{ "scrlock", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
 #ifdef __sparc64__
 	{ "pcib_mtx", &lock_class_mtx_spin },
 	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
 	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
 	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-per-proc", &lock_class_mtx_spin },
 #endif
 	{ "process slock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
 	{ "rm_spinlock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
 	{ "turnstile lock", &lock_class_mtx_spin },
 	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
 #ifdef SMP
 	{ "smp rendezvous", &lock_class_mtx_spin },
 #endif
 #ifdef __powerpc__
 	{ "tlb0", &lock_class_mtx_spin },
 #endif
 	/*
 	 * leaf locks
 	 */
 	{ "intrcnt", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
 #if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
 #endif
 #ifdef __i386__
 	{ "allpmaps", &lock_class_mtx_spin },
 	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
 	{ "cpuset", &lock_class_mtx_spin },
 	{ "mprof lock", &lock_class_mtx_spin },
 	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #if defined(__i386__) || defined(__amd64__)
 	{ "pcicfg", &lock_class_mtx_spin },
 	{ "NDIS thread lock", &lock_class_mtx_spin },
 #endif
 	{ "tw_osl_io_lock", &lock_class_mtx_spin },
 	{ "tw_osl_q_lock", &lock_class_mtx_spin },
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-leaf", &lock_class_mtx_spin },
 #endif
 	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
 
 #ifdef BLESSING
 /*
  * Pairs of locks which have been blessed
  * Don't complain about order problems with blessed locks
  */
 static struct witness_blessed blessed_list[] = {
 };
 #endif
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
  */
 static int witness_cold = 1;
 
 /*
  * This global is set to 1 once the static lock orders have been enrolled
  * so that a warning can be issued for any spin locks enrolled later.
  */
 static int witness_spin_warn = 0;
 
 /* Trim useless garbage from filenames. */
 static const char *
 fixup_filename(const char *file)
 {
 
 	if (file == NULL)
 		return (NULL);
 	while (strncmp(file, "../", 3) == 0)
 		file += 3;
 	return (file);
 }
 
 /*
  * The WITNESS-enabled diagnostic code.  Note that the witness code does
  * assume that the early boot is single-threaded at least until after this
  * routine is completed.
  */
 static void
 witness_initialize(void *dummy __unused)
 {
 	struct lock_object *lock;
 	struct witness_order_list_entry *order;
 	struct witness *w, *w1;
 	int i;
 
 	w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS,
 	    M_WAITOK | M_ZERO);
 
 	w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1),
 	    M_WITNESS, M_WAITOK | M_ZERO);
 
 	for (i = 0; i < witness_count + 1; i++) {
 		w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) *
 		    (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO);
 	}
 	badstack_sbuf_size = witness_count * 256;
 
 	/*
 	 * We have to release Giant before initializing its witness
 	 * structure so that WITNESS doesn't get confused.
 	 */
 	mtx_unlock(&Giant);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
 	    MTX_NOWITNESS | MTX_NOPROFILE);
 	for (i = witness_count - 1; i >= 0; i--) {
 		w = &w_data[i];
 		memset(w, 0, sizeof(*w));
 		w_data[i].w_index = i;	/* Witness index never changes. */
 		witness_free(w);
 	}
 	KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
 	    ("%s: Invalid list of free witness objects", __func__));
 
 	/* Witness with index 0 is not used to aid in debugging. */
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	for (i = 0; i < witness_count; i++) {
 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * 
 		    (witness_count + 1));
 	}
 
 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
 		witness_lock_list_free(&w_locklistdata[i]);
 	witness_init_hash_tables();
 
 	/* First add in all the specified order lists. */
 	for (order = order_lists; order->w_name != NULL; order++) {
 		w = enroll(order->w_name, order->w_class);
 		if (w == NULL)
 			continue;
 		w->w_file = "order list";
 		for (order++; order->w_name != NULL; order++) {
 			w1 = enroll(order->w_name, order->w_class);
 			if (w1 == NULL)
 				continue;
 			w1->w_file = "order list";
 			itismychild(w, w1);
 			w = w1;
 		}
 	}
 	witness_spin_warn = 1;
 
 	/* Iterate through all locks and add them to witness. */
 	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
 		lock = pending_locks[i].wh_lock;
 		KASSERT(lock->lo_flags & LO_WITNESS,
 		    ("%s: lock %s is on pending list but not LO_WITNESS",
 		    __func__, lock->lo_name));
 		lock->lo_witness = enroll(pending_locks[i].wh_type,
 		    LOCK_CLASS(lock));
 	}
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
 
 	mtx_lock(&Giant);
 }
 SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
     NULL);
 
 void
 witness_init(struct lock_object *lock, const char *type)
 {
 	struct lock_class *class;
 
 	/* Various sanity checks. */
 	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be recursable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 	    (class->lc_flags & LC_SLEEPABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be sleepable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
 	    (class->lc_flags & LC_UPGRADABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be upgradable",
 		    __func__, class->lc_name, lock->lo_name);
 
 	/*
 	 * If we shouldn't watch this lock, then just clear lo_witness.
 	 * Otherwise, if witness_cold is set, then it is too early to
 	 * enroll this lock, so defer it to witness_initialize() by adding
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
 	if (witness_watch < 1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
 		pending_locks[pending_cnt].wh_lock = lock;
 		pending_locks[pending_cnt++].wh_type = type;
 		if (pending_cnt > WITNESS_PENDLIST)
 			panic("%s: pending locks list is too small, "
 			    "increase WITNESS_PENDLIST\n",
 			    __func__);
 	} else
 		lock->lo_witness = enroll(type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
 	struct lock_class *class;
 	struct witness *w;
 
 	class = LOCK_CLASS(lock);
 
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
 		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
 	if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
 		return;
 	w = lock->lo_witness;
 
 	mtx_lock_spin(&w_mtx);
 	MPASS(w->w_refcount > 0);
 	w->w_refcount--;
 
 	if (w->w_refcount == 0)
 		depart(w);
 	mtx_unlock_spin(&w_mtx);
 }
 
 #ifdef DDB
 static void
 witness_ddb_compute_levels(void)
 {
 	struct witness *w;
 
 	/*
 	 * First clear all levels.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_ddb_level = -1;
 
 	/*
 	 * Look for locks with no parents and level all their descendants.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list) {
 
 		/* If the witness has ancestors (is not a root), skip it. */
 		if (w->w_num_ancestors > 0)
 			continue;
 		witness_ddb_level_descendants(w, 0);
 	}
 }
 
 static void
 witness_ddb_level_descendants(struct witness *w, int l)
 {
 	int i;
 
 	if (w->w_ddb_level >= l)
 		return;
 
 	w->w_ddb_level = l;
 	l++;
 
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_level_descendants(&w_data[i], l);
 	}
 }
 
 static void
 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
     struct witness *w, int indent)
 {
 	int i;
 
  	for (i = 0; i < indent; i++)
  		prnt(" ");
 	prnt("%s (type: %s, depth: %d, active refs: %d)",
 	     w->w_name, w->w_class->lc_name,
 	     w->w_ddb_level, w->w_refcount);
  	if (w->w_displayed) {
  		prnt(" -- (already displayed)\n");
  		return;
  	}
  	w->w_displayed = 1;
 	if (w->w_file != NULL && w->w_line != 0)
 		prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
 		    w->w_line);
 	else
 		prnt(" -- never acquired\n");
 	indent++;
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (db_pager_quit)
 			return;
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_display_descendants(prnt, &w_data[i],
 			    indent);
 	}
 }
 
 static void
 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
     struct witness_list *list)
 {
 	struct witness *w;
 
 	STAILQ_FOREACH(w, list, w_typelist) {
 		if (w->w_file == NULL || w->w_ddb_level > 0)
 			continue;
 
 		/* This lock has no anscestors - display its descendants. */
 		witness_ddb_display_descendants(prnt, w, 0);
 		if (db_pager_quit)
 			return;
 	}
 }
 	
 static void
 witness_ddb_display(int(*prnt)(const char *fmt, ...))
 {
 	struct witness *w;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	witness_ddb_compute_levels();
 
 	/* Clear all the displayed flags. */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 
 	/*
 	 * First, handle sleep locks which have been acquired at least
 	 * once.
 	 */
 	prnt("Sleep locks:\n");
 	witness_ddb_display_list(prnt, &w_sleep);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Now do spin locks which have been acquired at least once.
 	 */
 	prnt("\nSpin locks:\n");
 	witness_ddb_display_list(prnt, &w_spin);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Finally, any locks which have not been acquired yet.
 	 */
 	prnt("\nLocks which were never acquired:\n");
 	STAILQ_FOREACH(w, &w_all, w_list) {
 		if (w->w_file != NULL || w->w_refcount == 0)
 			continue;
 		prnt("%s (type: %s, depth: %d)\n", w->w_name,
 		    w->w_class->lc_name, w->w_ddb_level);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif /* DDB */
 
 int
 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
 {
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (0);
 
 	/* Require locks that witness knows about. */
 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
 	    lock2->lo_witness == NULL)
 		return (EINVAL);
 
 	mtx_assert(&w_mtx, MA_NOTOWNED);
 	mtx_lock_spin(&w_mtx);
 
 	/*
 	 * If we already have either an explicit or implied lock order that
 	 * is the other way around, then return an error.
 	 */
 	if (witness_watch &&
 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
 		mtx_unlock_spin(&w_mtx);
 		return (EDOOFUS);
 	}
 	
 	/* Try to add the new order. */
 	CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 	    lock2->lo_witness->w_name, lock1->lo_witness->w_name);
 	itismychild(lock1->lo_witness, lock2->lo_witness);
 	mtx_unlock_spin(&w_mtx);
 	return (0);
 }
 
 void
 witness_checkorder(struct lock_object *lock, int flags, const char *file,
     int line, struct lock_object *interlock)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1, *lock2, *plock;
 	struct lock_class *class, *iclass;
 	struct witness *w, *w1;
 	struct thread *td;
 	int i, j;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 
 	w = lock->lo_witness;
 	class = LOCK_CLASS(lock);
 	td = curthread;
 
 	if (class->lc_flags & LC_SLEEPLOCK) {
 
 		/*
 		 * Since spin locks include a critical section, this check
 		 * implicitly enforces a lock order of all sleep locks before
 		 * all spin locks.
 		 */
 		if (td->td_critnest != 0 && !kdb_active)
 			kassert_panic("acquiring blockable sleep lock with "
 			    "spinlock or critical section held (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 
 		/*
 		 * If this is the first lock acquired then just return as
 		 * no order checking is needed.
 		 */
 		lock_list = td->td_sleeplocks;
 		if (lock_list == NULL || lock_list->ll_count == 0)
 			return;
 	} else {
 
 		/*
 		 * If this is the first lock, just return as no order
 		 * checking is needed.  Avoid problems with thread
 		 * migration pinning the thread while checking if
 		 * spinlocks are held.  If at least one spinlock is held
 		 * the thread is in a safe path and it is allowed to
 		 * unpin it.
 		 */
 		sched_pin();
 		lock_list = PCPU_GET(spinlocks);
 		if (lock_list == NULL || lock_list->ll_count == 0) {
 			sched_unpin();
 			return;
 		}
 		sched_unpin();
 	}
 
 	/*
 	 * Check to see if we are recursing on a lock we already own.  If
 	 * so, make sure that we don't mismatch exclusive and shared lock
 	 * acquires.
 	 */
 	lock1 = find_instance(lock_list, lock);
 	if (lock1 != NULL) {
 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
 		    (flags & LOP_EXCLUSIVE) == 0) {
 			witness_output("shared lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while exclusively locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("excl->share");
 		}
 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
 		    (flags & LOP_EXCLUSIVE) != 0) {
 			witness_output("exclusive lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while share locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("share->excl");
 		}
 		return;
 	}
 
 	/* Warn if the interlock is not locked exactly once. */
 	if (interlock != NULL) {
 		iclass = LOCK_CLASS(interlock);
 		lock1 = find_instance(lock_list, interlock);
 		if (lock1 == NULL)
 			kassert_panic("interlock (%s) %s not locked @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("interlock (%s) %s recursed @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 	}
 
 	/*
 	 * Find the previously acquired lock, but ignore interlocks.
 	 */
 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
 	if (interlock != NULL && plock->li_lock == interlock) {
 		if (lock_list->ll_count > 1)
 			plock =
 			    &lock_list->ll_children[lock_list->ll_count - 2];
 		else {
 			lle = lock_list->ll_next;
 
 			/*
 			 * The interlock is the only lock we hold, so
 			 * simply return.
 			 */
 			if (lle == NULL)
 				return;
 			plock = &lle->ll_children[lle->ll_count - 1];
 		}
 	}
 	
 	/*
 	 * Try to perform most checks without a lock.  If this succeeds we
 	 * can skip acquiring the lock and return success.  Otherwise we redo
 	 * the check with the lock held to handle races with concurrent updates.
 	 */
 	w1 = plock->li_lock->lo_witness;
 	if (witness_lock_order_check(w1, w))
 		return;
 
 	mtx_lock_spin(&w_mtx);
 	if (witness_lock_order_check(w1, w)) {
 		mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	witness_lock_order_add(w1, w);
 
 	/*
 	 * Check for duplicate locks of the same type.  Note that we only
 	 * have to check for this on the last lock we just acquired.  Any
 	 * other cases will be caught as lock order violations.
 	 */
 	if (w1 == w) {
 		i = w->w_index;
 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
 		    w_rmatrix[i][i] |= WITNESS_REVERSAL;
 			w->w_reversed = 1;
 			mtx_unlock_spin(&w_mtx);
 			witness_output(
 			    "acquiring duplicate lock of same type: \"%s\"\n", 
 			    w->w_name);
 			witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
 			    fixup_filename(plock->li_file), plock->li_line);
 			witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
 			    fixup_filename(file), line);
 			witness_debugger(1, __func__);
 		} else
 			mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	mtx_assert(&w_mtx, MA_OWNED);
 
 	/*
 	 * If we know that the lock we are acquiring comes after
 	 * the lock we most recently acquired in the lock order tree,
 	 * then there is no need for any further checks.
 	 */
 	if (isitmychild(w1, w))
 		goto out;
 
 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
 
 			MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
 			lock1 = &lle->ll_children[i];
 
 			/*
 			 * Ignore the interlock.
 			 */
 			if (interlock == lock1->li_lock)
 				continue;
 
 			/*
 			 * If this lock doesn't undergo witness checking,
 			 * then skip it.
 			 */
 			w1 = lock1->li_lock->lo_witness;
 			if (w1 == NULL) {
 				KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
 				    ("lock missing witness structure"));
 				continue;
 			}
 
 			/*
 			 * If we are locking Giant and this is a sleepable
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * isn't sleepable, we want to treat it as a lock
 			 * order violation to enfore a general lock order of
 			 * sleepable locks before non-sleepable locks.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				goto reversal;
 
 			/*
 			 * If we are locking Giant and this is a non-sleepable
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
 			    lock == &Giant.lock_object)
 				goto reversal;
 
 			/*
 			 * Check the lock order hierarchy for a reveresal.
 			 */
 			if (!isitmydescendant(w, w1))
 				continue;
 		reversal:
 
 			/*
 			 * We have a lock order violation, check to see if it
 			 * is allowed or has already been yelled about.
 			 */
 #ifdef BLESSING
 
 			/*
 			 * If the lock order is blessed, just bail.  We don't
 			 * look for other lock order violations though, which
 			 * may be a bug.
 			 */
 			if (blessed(w, w1))
 				goto out;
 #endif
 
 			/* Bail if this violation is known */
 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
 				goto out;
 
 			/* Record this as a violation */
 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
 			w->w_reversed = w1->w_reversed = 1;
 			witness_increment_graph_generation();
 			mtx_unlock_spin(&w_mtx);
 
 #ifdef WITNESS_NO_VNODE
 			/*
 			 * There are known LORs between VNODE locks. They are
 			 * not an indication of a bug. VNODE locks are flagged
 			 * as such (LO_IS_VNODE) and we don't yell if the LOR
 			 * is between 2 VNODE locks.
 			 */
 			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
 				return;
 #endif
 
 			/*
 			 * Ok, yell about it.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				witness_output(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
 			    && lock == &Giant.lock_object)
 				witness_output(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
 				witness_output("lock order reversal:\n");
 
 			/*
 			 * Try to locate an earlier lock with
 			 * witness w in our list.
 			 */
 			do {
 				lock2 = &lle->ll_children[i];
 				MPASS(lock2->li_lock != NULL);
 				if (lock2->li_lock->lo_witness == w)
 					break;
 				if (i == 0 && lle->ll_next != NULL) {
 					lle = lle->ll_next;
 					i = lle->ll_count - 1;
 					MPASS(i >= 0 && i < LOCK_NCHILDREN);
 				} else
 					i--;
 			} while (i >= 0);
 			if (i < 0) {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			} else {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock2->li_lock, lock2->li_lock->lo_name,
 				    lock2->li_lock->lo_witness->w_name,
 				    fixup_filename(lock2->li_file),
 				    lock2->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			}
 			witness_debugger(1, __func__);
 			return;
 		}
 	}
 
 	/*
 	 * If requested, build a new lock order.  However, don't build a new
 	 * relationship between a sleepable lock and Giant if it is in the
 	 * wrong direction.  The correct lock order is that sleepable locks
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
 	    !(plock->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    w->w_name, plock->li_lock->lo_witness->w_name);
 		itismychild(plock->li_lock->lo_witness, w);
 	}
 out:
 	mtx_unlock_spin(&w_mtx);
 }
 
 void
 witness_lock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct witness *w;
 	struct thread *td;
 
 	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 	w = lock->lo_witness;
 	td = curthread;
 
 	/* Determine lock list for this lock. */
 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 
 	/* Check to see if we are recursing on a lock we already own. */
 	instance = find_instance(*lock_list, lock);
 	if (instance != NULL) {
 		instance->li_flags++;
 		CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, lock->lo_name,
 		    instance->li_flags & LI_RECURSEMASK);
 		instance->li_file = file;
 		instance->li_line = line;
 		return;
 	}
 
 	/* Update per-witness last file and line acquire. */
 	w->w_file = file;
 	w->w_line = line;
 
 	/* Find the next open lock instance in the list and fill it. */
 	lle = *lock_list;
 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
 		lle = witness_lock_list_get();
 		if (lle == NULL)
 			return;
 		lle->ll_next = *lock_list;
 		CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		*lock_list = lle;
 	}
 	instance = &lle->ll_children[lle->ll_count++];
 	instance->li_lock = lock;
 	instance->li_line = line;
 	instance->li_file = file;
 	if ((flags & LOP_EXCLUSIVE) != 0)
 		instance->li_flags = LI_EXCLUSIVE;
 	else
 		instance->li_flags = 0;
 	CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
 	    td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
 }
 
 void
 witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "upgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "upgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "upgrade of exclusive lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags |= LI_EXCLUSIVE;
 }
 
 void
 witness_downgrade(struct lock_object *lock, int flags, const char *file,
     int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "downgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "downgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "downgrade of shared lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags &= ~LI_EXCLUSIVE;
 }
 
 void
 witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct lock_class *class;
 	struct thread *td;
 	register_t s;
 	int i, j;
 
 	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
 
 	/* Find lock instance associated with this lock. */
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 	lle = *lock_list;
 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
 		for (i = 0; i < (*lock_list)->ll_count; i++) {
 			instance = &(*lock_list)->ll_children[i];
 			if (instance->li_lock == lock)
 				goto found;
 		}
 
 	/*
 	 * When disabling WITNESS through witness_watch we could end up in
 	 * having registered locks in the td_sleeplocks queue.
 	 * We have to make sure we flush these queues, so just search for
 	 * eventual register locks and remove them.
 	 */
 	if (witness_watch > 0) {
 		kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
 		    lock->lo_name, fixup_filename(file), line);
 		return;
 	} else {
 		return;
 	}
 found:
 
 	/* First, check for shared/exclusive mismatches. */
 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) == 0) {
 		witness_output("shared unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while exclusively locked from %s:%d\n",
 		    fixup_filename(instance->li_file), instance->li_line);
 		kassert_panic("excl->ushare");
 	}
 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) != 0) {
 		witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while share locked from %s:%d\n",
 		    fixup_filename(instance->li_file),
 		    instance->li_line);
 		kassert_panic("share->uexcl");
 	}
 	/* If we are recursed, unrecurse. */
 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
 		CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, instance->li_lock->lo_name,
 		    instance->li_flags);
 		instance->li_flags--;
 		return;
 	}
 	/* The lock is now being dropped, check for NORELEASE flag */
 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
 		witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		kassert_panic("lock marked norelease");
 	}
 
 	/* Otherwise, remove this item from the list. */
 	s = intr_disable();
 	CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
 	    td->td_proc->p_pid, instance->li_lock->lo_name,
 	    (*lock_list)->ll_count - 1);
 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
 		(*lock_list)->ll_children[j] =
 		    (*lock_list)->ll_children[j + 1];
 	(*lock_list)->ll_count--;
 	intr_restore(s);
 
 	/*
 	 * In order to reduce contention on w_mtx, we want to keep always an
 	 * head object into lists so that frequent allocation from the 
 	 * free witness pool (and subsequent locking) is avoided.
 	 * In order to maintain the current code simple, when the head
 	 * object is totally unloaded it means also that we do not have
 	 * further objects in the list, so the list ownership needs to be
 	 * hand over to another object if the current head needs to be freed.
 	 */
 	if ((*lock_list)->ll_count == 0) {
 		if (*lock_list == lle) {
 			if (lle->ll_next == NULL)
 				return;
 		} else
 			lle = *lock_list;
 		*lock_list = lle->ll_next;
 		CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		witness_lock_list_free(lle);
 	}
 }
 
 void
 witness_thread_exit(struct thread *td)
 {
 	struct lock_list_entry *lle;
 	int i, n;
 
 	lle = td->td_sleeplocks;
 	if (lle == NULL || panicstr != NULL)
 		return;
 	if (lle->ll_count != 0) {
 		for (n = 0; lle != NULL; lle = lle->ll_next)
 			for (i = lle->ll_count - 1; i >= 0; i--) {
 				if (n == 0)
 					witness_output(
 		    "Thread %p exiting with the following locks held:\n", td);
 				n++;
 				witness_list_lock(&lle->ll_children[i],
 				    witness_output);
 				
 			}
 		kassert_panic(
 		    "Thread %p cannot exit while holding sleeplocks\n", td);
 	}
 	witness_lock_list_free(lle);
 }
 
 /*
  * Warn if any locks other than 'lock' are held.  Flags can be passed in to
  * exempt Giant and sleepable locks from the checks as well.  If any
  * non-exempt locks are held, then a supplied message is printed to the
  * output channel along with a list of the offending locks.  If indicated in the
  * flags then a failure results in a panic as well.
  */
 int
 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1;
 	struct thread *td;
 	va_list ap;
 	int i, n;
 
 	if (witness_cold || witness_watch < 1 || panicstr != NULL)
 		return (0);
 	n = 0;
 	td = curthread;
 	for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			lock1 = &lle->ll_children[i];
 			if (lock1->li_lock == lock)
 				continue;
 			if (flags & WARN_GIANTOK &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			if (flags & WARN_SLEEPOK &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
 				continue;
 			if (n == 0) {
 				va_start(ap, fmt);
 				witness_voutput(fmt, ap);
 				va_end(ap);
 				witness_output(
 				    " with the following %slocks held:\n",
 				    (flags & WARN_SLEEPOK) != 0 ?
 				    "non-sleepable " : "");
 			}
 			n++;
 			witness_list_lock(lock1, witness_output);
 		}
 
 	/*
 	 * Pin the thread in order to avoid problems with thread migration.
 	 * Once that all verifies are passed about spinlocks ownership,
 	 * the thread is in a safe path and it can be unpinned.
 	 */
 	sched_pin();
 	lock_list = PCPU_GET(spinlocks);
 	if (lock_list != NULL && lock_list->ll_count != 0) {
 		sched_unpin();
 
 		/*
 		 * We should only have one spinlock and as long as
 		 * the flags cannot match for this locks class,
 		 * check if the first spinlock is the one curthread
 		 * should hold.
 		 */
 		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
 		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
 		    lock1->li_lock == lock && n == 0)
 			return (0);
 
 		va_start(ap, fmt);
 		witness_voutput(fmt, ap);
 		va_end(ap);
 		witness_output(" with the following %slocks held:\n",
 		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
 		n += witness_list_locks(&lock_list, witness_output);
 	} else
 		sched_unpin();
 	if (flags & WARN_PANIC && n)
 		kassert_panic("%s", __func__);
 	else
 		witness_debugger(n, __func__);
 	return (n);
 }
 
 const char *
 witness_file(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return ("?");
 	w = lock->lo_witness;
 	return (w->w_file);
 }
 
 int
 witness_line(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return (0);
 	w = lock->lo_witness;
 	return (w->w_line);
 }
 
 static struct witness *
 enroll(const char *description, struct lock_class *lock_class)
 {
 	struct witness *w;
 	struct witness_list *typelist;
 
 	MPASS(description != NULL);
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (NULL);
 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
 		if (witness_skipspin)
 			return (NULL);
 		else
 			typelist = &w_spin;
 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
 		typelist = &w_sleep;
 	} else {
 		kassert_panic("lock class %s is not sleep or spin",
 		    lock_class->lc_name);
 		return (NULL);
 	}
 
 	mtx_lock_spin(&w_mtx);
 	w = witness_hash_get(description);
 	if (w)
 		goto found;
 	if ((w = witness_get()) == NULL)
 		return (NULL);
 	MPASS(strlen(description) < MAX_W_NAME);
 	strcpy(w->w_name, description);
 	w->w_class = lock_class;
 	w->w_refcount = 1;
 	STAILQ_INSERT_HEAD(&w_all, w, w_list);
 	if (lock_class->lc_flags & LC_SPINLOCK) {
 		STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
 		w_spin_cnt++;
 	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
 		STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
 		w_sleep_cnt++;
 	}
 
 	/* Insert new witness into the hash */
 	witness_hash_put(w);
 	witness_increment_graph_generation();
 	mtx_unlock_spin(&w_mtx);
 	return (w);
 found:
 	w->w_refcount++;
 	mtx_unlock_spin(&w_mtx);
 	if (lock_class != w->w_class)
 		kassert_panic(
 			"lock (%s) %s does not match earlier (%s) lock",
 			description, lock_class->lc_name,
 			w->w_class->lc_name);
 	return (w);
 }
 
 static void
 depart(struct witness *w)
 {
 	struct witness_list *list;
 
 	MPASS(w->w_refcount == 0);
 	if (w->w_class->lc_flags & LC_SLEEPLOCK) {
 		list = &w_sleep;
 		w_sleep_cnt--;
 	} else {
 		list = &w_spin;
 		w_spin_cnt--;
 	}
 	/*
 	 * Set file to NULL as it may point into a loadable module.
 	 */
 	w->w_file = NULL;
 	w->w_line = 0;
 	witness_increment_graph_generation();
 }
 
 
 static void
 adopt(struct witness *parent, struct witness *child)
 {
 	int pi, ci, i, j;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	/* If the relationship is already known, there's no work to be done. */
 	if (isitmychild(parent, child))
 		return;
 
 	/* When the structure of the graph changes, bump up the generation. */
 	witness_increment_graph_generation();
 
 	/*
 	 * The hard part ... create the direct relationship, then propagate all
 	 * indirect relationships.
 	 */
 	pi = parent->w_index;
 	ci = child->w_index;
 	WITNESS_INDEX_ASSERT(pi);
 	WITNESS_INDEX_ASSERT(ci);
 	MPASS(pi != ci);
 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
 
 	/*
 	 * If parent was not already an ancestor of child,
 	 * then we increment the descendant and ancestor counters.
 	 */
 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
 		parent->w_num_descendants++;
 		child->w_num_ancestors++;
 	}
 
 	/* 
 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as 
 	 * an ancestor of 'pi' during this loop.
 	 */
 	for (i = 1; i <= w_max_used_index; i++) {
 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && 
 		    (i != pi))
 			continue;
 
 		/* Find each descendant of 'i' and mark it as a descendant. */
 		for (j = 1; j <= w_max_used_index; j++) {
 
 			/* 
 			 * Skip children that are already marked as
 			 * descendants of 'i'.
 			 */
 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
 				continue;
 
 			/*
 			 * We are only interested in descendants of 'ci'. Note
 			 * that 'ci' itself is counted as a descendant of 'ci'.
 			 */
 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && 
 			    (j != ci))
 				continue;
 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
 			w_data[i].w_num_descendants++;
 			w_data[j].w_num_ancestors++;
 
 			/* 
 			 * Make sure we aren't marking a node as both an
 			 * ancestor and descendant. We should have caught 
 			 * this as a lock order reversal earlier.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    i, j, w_rmatrix[i][j]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    j, i, w_rmatrix[j][i]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 		}
 	}
 }
 
 static void
 itismychild(struct witness *parent, struct witness *child)
 {
 	int unlocked;
 
 	MPASS(child != NULL && parent != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (!witness_lock_type_equal(parent, child)) {
 		if (witness_cold == 0) {
 			unlocked = 1;
 			mtx_unlock_spin(&w_mtx);
 		} else {
 			unlocked = 0;
 		}
 		kassert_panic(
 		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
 		    "the same lock type", __func__, parent->w_name,
 		    parent->w_class->lc_name, child->w_name,
 		    child->w_class->lc_name);
 		if (unlocked)
 			mtx_lock_spin(&w_mtx);
 	}
 	adopt(parent, child);
 }
 
 /*
  * Generic code for the isitmy*() functions. The rmask parameter is the
  * expected relationship of w1 to w2.
  */
 static int
 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
 {
 	unsigned char r1, r2;
 	int i1, i2;
 
 	i1 = w1->w_index;
 	i2 = w2->w_index;
 	WITNESS_INDEX_ASSERT(i1);
 	WITNESS_INDEX_ASSERT(i2);
 	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
 	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
 
 	/* The flags on one better be the inverse of the flags on the other */
 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
 		/* Don't squawk if we're potentially racing with an update. */
 		if (!mtx_owned(&w_mtx))
 			return (0);
 		printf("%s: rmatrix mismatch between %s (index %d) and %s "
 		    "(index %d): w_rmatrix[%d][%d] == %hhx but "
 		    "w_rmatrix[%d][%d] == %hhx\n",
 		    fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
 		    i2, i1, r2);
 		kdb_backtrace();
 		printf("Witness disabled.\n");
 		witness_watch = -1;
 	}
 	return (r1 & rmask);
 }
 
 /*
  * Checks if @child is a direct child of @parent.
  */
 static int
 isitmychild(struct witness *parent, struct witness *child)
 {
 
 	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
 }
 
 /*
  * Checks if @descendant is a direct or inderect descendant of @ancestor.
  */
 static int
 isitmydescendant(struct witness *ancestor, struct witness *descendant)
 {
 
 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
 	    __func__));
 }
 
 #ifdef BLESSING
 static int
 blessed(struct witness *w1, struct witness *w2)
 {
 	int i;
 	struct witness_blessed *b;
 
 	for (i = 0; i < nitems(blessed_list); i++) {
 		b = &blessed_list[i];
 		if (strcmp(w1->w_name, b->b_lock1) == 0) {
 			if (strcmp(w2->w_name, b->b_lock2) == 0)
 				return (1);
 			continue;
 		}
 		if (strcmp(w1->w_name, b->b_lock2) == 0)
 			if (strcmp(w2->w_name, b->b_lock1) == 0)
 				return (1);
 	}
 	return (0);
 }
 #endif
 
 static struct witness *
 witness_get(void)
 {
 	struct witness *w;
 	int index;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_watch == -1) {
 		mtx_unlock_spin(&w_mtx);
 		return (NULL);
 	}
 	if (STAILQ_EMPTY(&w_free)) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("WITNESS: unable to allocate a new witness object\n");
 		return (NULL);
 	}
 	w = STAILQ_FIRST(&w_free);
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 	index = w->w_index;
 	MPASS(index > 0 && index == w_max_used_index+1 &&
 	    index < witness_count);
 	bzero(w, sizeof(*w));
 	w->w_index = index;
 	if (index > w_max_used_index)
 		w_max_used_index = index;
 	return (w);
 }
 
 static void
 witness_free(struct witness *w)
 {
 
 	STAILQ_INSERT_HEAD(&w_free, w, w_list);
 	w_free_cnt++;
 }
 
 static struct lock_list_entry *
 witness_lock_list_get(void)
 {
 	struct lock_list_entry *lle;
 
 	if (witness_watch == -1)
 		return (NULL);
 	mtx_lock_spin(&w_mtx);
 	lle = w_lock_list_free;
 	if (lle == NULL) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("%s: witness exhausted\n", __func__);
 		return (NULL);
 	}
 	w_lock_list_free = lle->ll_next;
 	mtx_unlock_spin(&w_mtx);
 	bzero(lle, sizeof(*lle));
 	return (lle);
 }
 		
 static void
 witness_lock_list_free(struct lock_list_entry *lle)
 {
 
 	mtx_lock_spin(&w_mtx);
 	lle->ll_next = w_lock_list_free;
 	w_lock_list_free = lle;
 	mtx_unlock_spin(&w_mtx);
 }
 
 static struct lock_instance *
 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
 {
 	struct lock_list_entry *lle;
 	struct lock_instance *instance;
 	int i;
 
 	for (lle = list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			instance = &lle->ll_children[i];
 			if (instance->li_lock == lock)
 				return (instance);
 		}
 	return (NULL);
 }
 
 static void
 witness_list_lock(struct lock_instance *instance,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_object *lock;
 
 	lock = instance->li_lock;
 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	if (lock->lo_witness->w_name != lock->lo_name)
 		prnt(" (%s)", lock->lo_witness->w_name);
 	prnt(" r = %d (%p) locked @ %s:%d\n",
 	    instance->li_flags & LI_RECURSEMASK, lock,
 	    fixup_filename(instance->li_file), instance->li_line);
 }
 
 static int
 witness_output(const char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
 	ret = witness_voutput(fmt, ap);
 	va_end(ap);
 	return (ret);
 }
 
 static int
 witness_voutput(const char *fmt, va_list ap)
 {
 	int ret;
 
 	ret = 0;
 	switch (witness_channel) {
 	case WITNESS_CONSOLE:
 		ret = vprintf(fmt, ap);
 		break;
 	case WITNESS_LOG:
 		vlog(LOG_NOTICE, fmt, ap);
 		break;
 	case WITNESS_NONE:
 		break;
 	}
 	return (ret);
 }
 
 #ifdef DDB
 static int
 witness_thread_has_locks(struct thread *td)
 {
 
 	if (td->td_sleeplocks == NULL)
 		return (0);
 	return (td->td_sleeplocks->ll_count != 0);
 }
 
 static int
 witness_proc_has_locks(struct proc *p)
 {
 	struct thread *td;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (witness_thread_has_locks(td))
 			return (1);
 	}
 	return (0);
 }
 #endif
 
 int
 witness_list_locks(struct lock_list_entry **lock_list,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_list_entry *lle;
 	int i, nheld;
 
 	nheld = 0;
 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			witness_list_lock(&lle->ll_children[i], prnt);
 			nheld++;
 		}
 	return (nheld);
 }
 
 /*
  * This is a bit risky at best.  We call this function when we have timed
  * out acquiring a spin lock, and we assume that the other CPU is stuck
  * with this lock held.  So, we go groveling around in the other CPU's
  * per-cpu data to try to find the lock instance for this spin lock to
  * see when it was last acquired.
  */
 void
 witness_display_spinlock(struct lock_object *lock, struct thread *owner,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_instance *instance;
 	struct pcpu *pc;
 
 	if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
 		return;
 	pc = pcpu_find(owner->td_oncpu);
 	instance = find_instance(pc->pc_spinlocks, lock);
 	if (instance != NULL)
 		witness_list_lock(instance, prnt);
 }
 
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	*filep = instance->li_file;
 	*linep = instance->li_line;
 }
 
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	if (instance == NULL)
 		return;
 	instance->li_file = file;
 	instance->li_line = line;
 }
 
 void
 witness_assert(const struct lock_object *lock, int flags, const char *file,
     int line)
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		kassert_panic("Lock (%s) %s is not sleep or spin!",
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	switch (flags) {
 	case LA_UNLOCKED:
 		if (instance != NULL)
 			kassert_panic("Lock (%s) %s locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
 	case LA_LOCKED | LA_NOTRECURSED:
 	case LA_SLOCKED:
 	case LA_SLOCKED | LA_RECURSED:
 	case LA_SLOCKED | LA_NOTRECURSED:
 	case LA_XLOCKED:
 	case LA_XLOCKED | LA_RECURSED:
 	case LA_XLOCKED | LA_NOTRECURSED:
 		if (instance == NULL) {
 			kassert_panic("Lock (%s) %s not locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "Lock (%s) %s not exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "Lock (%s) %s exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("Lock (%s) %s recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	default:
 		kassert_panic("Invalid lock assertion at %s:%d.",
 		    fixup_filename(file), line);
 
 	}
 #endif	/* INVARIANT_SUPPORT */
 }
 
 static void
 witness_setflag(struct lock_object *lock, int flag, int set)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 
 	if (set)
 		instance->li_flags |= flag;
 	else
 		instance->li_flags &= ~flag;
 }
 
 void
 witness_norelease(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 1);
 }
 
 void
 witness_releaseok(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 0);
 }
 
 #ifdef DDB
 static void
 witness_ddb_list(struct thread *td)
 {
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	KASSERT(kdb_active, ("%s: not in the debugger", __func__));
 
 	if (witness_watch < 1)
 		return;
 
 	witness_list_locks(&td->td_sleeplocks, db_printf);
 
 	/*
 	 * We only handle spinlocks if td == curthread.  This is somewhat broken
 	 * if td is currently executing on some other CPU and holds spin locks
 	 * as we won't display those locks.  If we had a MI way of getting
 	 * the per-cpu data for a given cpu then we could use
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
 	 * That still wouldn't really fix this unless we locked the scheduler
 	 * lock or stopped the other CPU to make sure it wasn't changing the
 	 * list out from under us.  It is probably best to just not try to
 	 * handle threads on other CPU's for now.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks), db_printf);
 }
 
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
 
 	if (have_addr)
 		td = db_lookup_thread(addr, true);
 	else
 		td = kdb_thread;
 	witness_ddb_list(td);
 }
 
 DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
 {
 	struct thread *td;
 	struct proc *p;
 
 	/*
 	 * It would be nice to list only threads and processes that actually
 	 * held sleep locks, but that information is currently not exported
 	 * by WITNESS.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!witness_proc_has_locks(p))
 			continue;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (!witness_thread_has_locks(td))
 				continue;
 			db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
 			    p->p_comm, td, td->td_tid);
 			witness_ddb_list(td);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 DB_SHOW_ALIAS(alllocks, db_witness_list_all)
 
 DB_SHOW_COMMAND(witness, db_witness_display)
 {
 
 	witness_ddb_display(db_printf);
 }
 #endif
 
 static int
 sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
 {
 	struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
 	struct witness *tmp_w1, *tmp_w2, *w1, *w2;
 	struct sbuf *sb;
 	u_int w_rmatrix1, w_rmatrix2;
 	int error, generation, i, j;
 
 	tmp_data1 = NULL;
 	tmp_data2 = NULL;
 	tmp_w1 = NULL;
 	tmp_w2 = NULL;
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 	sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/* Allocate and init temporary storage space. */
 	tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	stack_zero(&tmp_data1->wlod_stack);
 	stack_zero(&tmp_data2->wlod_stack);
 
 restart:
 	mtx_lock_spin(&w_mtx);
 	generation = w_generation;
 	mtx_unlock_spin(&w_mtx);
 	sbuf_printf(sb, "Number of known direct relationships is %d\n",
 	    w_lohash.wloh_count);
 	for (i = 1; i < w_max_used_index; i++) {
 		mtx_lock_spin(&w_mtx);
 		if (generation != w_generation) {
 			mtx_unlock_spin(&w_mtx);
 
 			/* The graph has changed, try again. */
 			req->oldidx = 0;
 			sbuf_clear(sb);
 			goto restart;
 		}
 
 		w1 = &w_data[i];
 		if (w1->w_reversed == 0) {
 			mtx_unlock_spin(&w_mtx);
 			continue;
 		}
 
 		/* Copy w1 locally so we can release the spin lock. */
 		*tmp_w1 = *w1;
 		mtx_unlock_spin(&w_mtx);
 
 		if (tmp_w1->w_reversed == 0)
 			continue;
 		for (j = 1; j < w_max_used_index; j++) {
 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
 				continue;
 
 			mtx_lock_spin(&w_mtx);
 			if (generation != w_generation) {
 				mtx_unlock_spin(&w_mtx);
 
 				/* The graph has changed, try again. */
 				req->oldidx = 0;
 				sbuf_clear(sb);
 				goto restart;
 			}
 
 			w2 = &w_data[j];
 			data1 = witness_lock_order_get(w1, w2);
 			data2 = witness_lock_order_get(w2, w1);
 
 			/*
 			 * Copy information locally so we can release the
 			 * spin lock.
 			 */
 			*tmp_w2 = *w2;
 			w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
 			w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
 
 			if (data1) {
 				stack_zero(&tmp_data1->wlod_stack);
 				stack_copy(&data1->wlod_stack,
 				    &tmp_data1->wlod_stack);
 			}
 			if (data2 && data2 != data1) {
 				stack_zero(&tmp_data2->wlod_stack);
 				stack_copy(&data2->wlod_stack,
 				    &tmp_data2->wlod_stack);
 			}
 			mtx_unlock_spin(&w_mtx);
 
 			sbuf_printf(sb,
 	    "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
 			    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 			    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 			if (data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data1->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 			if (data2 && data2 != data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name, 
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data2->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 		}
 	}
 	mtx_lock_spin(&w_mtx);
 	if (generation != w_generation) {
 		mtx_unlock_spin(&w_mtx);
 
 		/*
 		 * The graph changed while we were printing stack data,
 		 * try again.
 		 */
 		req->oldidx = 0;
 		sbuf_clear(sb);
 		goto restart;
 	}
 	mtx_unlock_spin(&w_mtx);
 
 	/* Free temporary storage space. */
 	free(tmp_data1, M_TEMP);
 	free(tmp_data2, M_TEMP);
 	free(tmp_w1, M_TEMP);
 	free(tmp_w2, M_TEMP);
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		enum witness_channel channel;
 		const char *name;
 	} channels[] = {
 		{ WITNESS_CONSOLE, "console" },
 		{ WITNESS_LOG, "log" },
 		{ WITNESS_NONE, "none" },
 	};
 	char buf[16];
 	u_int i;
 	int error;
 
 	buf[0] = '\0';
 	for (i = 0; i < nitems(channels); i++)
 		if (witness_channel == channels[i].channel) {
 			snprintf(buf, sizeof(buf), "%s", channels[i].name);
 			break;
 		}
 
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	error = EINVAL;
 	for (i = 0; i < nitems(channels); i++)
 		if (strcmp(channels[i].name, buf) == 0) {
 			witness_channel = channels[i].channel;
 			error = 0;
 			break;
 		}
 	return (error);
 }
 
 static int
 sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
 {
 	struct witness *w;
 	struct sbuf *sb;
 	int error;
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 	sbuf_printf(sb, "\n");
 
 	mtx_lock_spin(&w_mtx);
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 	STAILQ_FOREACH(w, &w_all, w_list)
 		witness_add_fullgraph(sb, w);
 	mtx_unlock_spin(&w_mtx);
 
 	/*
 	 * Close the sbuf and return to userland.
 	 */
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = witness_watch;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (value > 1 || value < -1 ||
 	    (witness_watch == -1 && value != witness_watch))
 		return (EINVAL);
 	witness_watch = value;
 	return (0);
 }
 
 static void
 witness_add_fullgraph(struct sbuf *sb, struct witness *w)
 {
 	int i;
 
 	if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
 		return;
 	w->w_displayed = 1;
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
 			sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
 			    w_data[i].w_name);
 			witness_add_fullgraph(sb, &w_data[i]);
 		}
 	}
 }
 
 /*
  * A simple hash function. Takes a key pointer and a key size. If size == 0,
  * interprets the key as a string and reads until the null
  * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
  * hash value computed from the key.
  */
 static uint32_t
 witness_hash_djb2(const uint8_t *key, uint32_t size)
 {
 	unsigned int hash = 5381;
 	int i;
 
 	/* hash = hash * 33 + key[i] */
 	if (size)
 		for (i = 0; i < size; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 	else
 		for (i = 0; key[i] != 0; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 
 	return (hash);
 }
 
 
 /*
  * Initializes the two witness hash tables. Called exactly once from
  * witness_initialize().
  */
 static void
 witness_init_hash_tables(void)
 {
 	int i;
 
 	MPASS(witness_cold);
 
 	/* Initialize the hash tables. */
 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
 		w_hash.wh_array[i] = NULL;
 
 	w_hash.wh_size = WITNESS_HASH_SIZE;
 	w_hash.wh_count = 0;
 
 	/* Initialize the lock order data hash. */
 	w_lofree = NULL;
 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
 		w_lodata[i].wlod_next = w_lofree;
 		w_lofree = &w_lodata[i];
 	}
 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
 	w_lohash.wloh_count = 0;
 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
 		w_lohash.wloh_array[i] = NULL;
 }
 
 static struct witness *
 witness_hash_get(const char *key)
 {
 	struct witness *w;
 	uint32_t hash;
 	
 	MPASS(key != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
 	w = w_hash.wh_array[hash];
 	while (w != NULL) {
 		if (strcmp(w->w_name, key) == 0)
 			goto out;
 		w = w->w_hash_next;
 	}
 
 out:
 	return (w);
 }
 
 static void
 witness_hash_put(struct witness *w)
 {
 	uint32_t hash;
 
 	MPASS(w != NULL);
 	MPASS(w->w_name != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	KASSERT(witness_hash_get(w->w_name) == NULL,
 	    ("%s: trying to add a hash entry that already exists!", __func__));
 	KASSERT(w->w_hash_next == NULL,
 	    ("%s: w->w_hash_next != NULL", __func__));
 
 	hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
 	w->w_hash_next = w_hash.wh_array[hash];
 	w_hash.wh_array[hash] = w;
 	w_hash.wh_count++;
 }
 
 
 static struct witness_lock_order_data *
 witness_lock_order_get(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		goto out;
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	data = w_lohash.wloh_array[hash];
 	while (data != NULL) {
 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
 			break;
 		data = data->wlod_next;
 	}
 
 out:
 	return (data);
 }
 
 /*
  * Verify that parent and child have a known relationship, are not the same,
  * and child is actually a child of parent.  This is done without w_mtx
  * to avoid contention in the common case.
  */
 static int
 witness_lock_order_check(struct witness *parent, struct witness *child)
 {
 
 	if (parent != child &&
 	    w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN &&
 	    isitmychild(parent, child))
 		return (1);
 
 	return (0);
 }
 
 static int
 witness_lock_order_add(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 	
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if (w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN)
 		return (1);
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
 	data = w_lofree;
 	if (data == NULL)
 		return (0);
 	w_lofree = data->wlod_next;
 	data->wlod_next = w_lohash.wloh_array[hash];
 	data->wlod_key = key;
 	w_lohash.wloh_array[hash] = data;
 	w_lohash.wloh_count++;
 	stack_zero(&data->wlod_stack);
 	stack_save(&data->wlod_stack);
 	return (1);
 }
 
 /* Call this whenever the structure of the witness graph changes. */
 static void
 witness_increment_graph_generation(void)
 {
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	w_generation++;
 }
 
 static int
 witness_output_drain(void *arg __unused, const char *data, int len)
 {
 
 	witness_output("%.*s", len, data);
 	return (len);
 }
 
 static void
 witness_debugger(int cond, const char *msg)
 {
 	char buf[32];
 	struct sbuf sb;
 	struct stack st;
 
 	if (!cond)
 		return;
 
 	if (witness_trace) {
 		sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 		sbuf_set_drain(&sb, witness_output_drain, NULL);
 
 		stack_zero(&st);
 		stack_save(&st);
 		witness_output("stack backtrace:\n");
 		stack_sbuf_print_ddb(&sb, &st);
 
 		sbuf_finish(&sb);
 	}
 
 #ifdef KDB
 	if (witness_kdb)
 		kdb_enter(KDB_WHY_WITNESS, msg);
 #endif
 }
Index: stable/11/sys/kern/vfs_cache.c
===================================================================
--- stable/11/sys/kern/vfs_cache.c	(revision 310958)
+++ stable/11/sys/kern/vfs_cache.c	(revision 310959)
@@ -1,1510 +1,2454 @@
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/uma.h>
 
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
     "char *", "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
     "struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
-    "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap_negative, done, "struct vnode *",
+    "char *", "int");
+SDT_PROBE_DEFINE3(vfs, namecache, shrink_negative, done, "struct vnode *",
+    "char *", "int");
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 
 struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
-	struct	vnode *nc_vp;		/* vnode the name refers to */
+	union {
+		struct	vnode *nu_vp;	/* vnode the name refers to */
+		u_int	nu_neghits;	/* negative entry hits */
+	} n_un;
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * struct namecache_ts repeats struct namecache layout up to the
  * nc_nlen member.
  * struct namecache_ts is used in place of struct namecache when time(s) need
  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  * both a non-dotdot directory name plus dotdot for the directory's
  * parent.
  */
 struct	namecache_ts {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
-	struct	vnode *nc_vp;		/* vnode the name refers to */
+	union {
+		struct	vnode *nu_vp;	/* vnode the name refers to */
+		u_int	nu_neghits;	/* negative entry hits */
+	} n_un;
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	struct	timespec nc_time;	/* timespec provided by fs */
 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
 	int	nc_ticks;		/* ticks value when entry was added */
 	char	nc_name[0];		/* segment name + nul */
 };
 
+#define	nc_vp		n_un.nu_vp
+#define	nc_neghits	n_un.nu_neghits
+
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	0x01
 #define NCF_ISDOTDOT	0x02
 #define	NCF_TS		0x04
 #define	NCF_DTS		0x08
+#define	NCF_DVDROP	0x10
+#define	NCF_NEGATIVE	0x20
+#define	NCF_HOTNEGATIVE	0x40
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (vp, name) where vp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME		TYPE	ROLE
+ * vnodelock	mtx	vnode lists and v_cache_dd field protection
+ * bucketlock	rwlock	for access to given set of hash buckets
+ * neglist	mtx	negative entry LRU management
+ *
+ * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
+ * shrinking the LRU list.
+ *
+ * It is legal to take multiple vnodelock and bucketlock locks. The locking
+ * order is lower address first. Both are recursive.
+ *
+ * "." lookups are lockless.
+ *
+ * ".." and vnode -> name lookups require vnodelock.
+ *
+ * name -> vnode lookup requires the relevant bucketlock to be held for reading.
+ *
+ * Insertions and removals of entries require involved vnodes and bucketlocks
+ * to be write-locked to prevent other threads from seeing the entry.
+ *
+ * Some lookups result in removal of the found entry (e.g. getting rid of a
+ * negative entry with the intent to create a positive one), which poses a
+ * problem when multiple threads reach the state. Similarly, two different
+ * threads can purge two different vnodes and try to remove the same name.
+ *
+ * If the already held vnode lock is lower than the second required lock, we
+ * can just take the other lock. However, in the opposite case, this could
+ * deadlock. As such, this is resolved by trylocking and if that fails unlocking
+ * the first node, locking everything in order and revalidating the state.
  */
 
 /*
  * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
-static TAILQ_HEAD(, namecache) ncneg;	/* Hash Table */
 static u_long	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
     "Ratio of negative namecache entries");
 static u_long	numneg;			/* number of negative entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
     "Number of negative entries in namecache");
 static u_long	numcache;		/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
     "Number of namecache entries");
 static u_long	numcachehv;		/* number of cache entries with vnodes held */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
     "Number of namecache entries with vnodes held");
 u_int	ncsizefactor = 2;
 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
     "Size factor for namecache");
+static u_int	ncpurgeminvnodes;
+SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
+    "Number of vnodes below which purgevfs ignores the request");
+static u_int	ncneghitsrequeue = 8;
+SYSCTL_UINT(_vfs, OID_AUTO, ncneghitsrequeue, CTLFLAG_RW, &ncneghitsrequeue, 0,
+    "Number of hits to requeue a negative entry in the LRU list");
 
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
-static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+static struct mtx       ncneg_shrink_lock;
+MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg",
+    MTX_DEF);
 
-#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
-#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
-#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
-#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
-#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
+struct neglist {
+	struct mtx		nl_lock;
+	TAILQ_HEAD(, namecache) nl_list;
+} __aligned(CACHE_LINE_SIZE);
 
+static struct neglist *neglists;
+static struct neglist ncneg_hot;
+
+static int	shrink_list_turn;
+
+static u_int	numneglists;
+static inline struct neglist *
+NCP2NEGLIST(struct namecache *ncp)
+{
+
+	return (&neglists[(((uintptr_t)(ncp) >> 8) % numneglists)]);
+}
+
+static u_int   numbucketlocks;
+static struct rwlock_padalign  *bucketlocks;
+#define	HASH2BUCKETLOCK(hash) \
+	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
+
+static u_int   numvnodelocks;
+static struct mtx *vnodelocks;
+static inline struct mtx *
+VP2VNODELOCK(struct vnode *vp)
+{
+	struct mtx *vlp;
+
+	if (vp == NULL)
+		return (NULL);
+	vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)];
+	return (vlp);
+}
+
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  */
 static uma_zone_t cache_zone_small;
 static uma_zone_t cache_zone_small_ts;
 static uma_zone_t cache_zone_large;
 static uma_zone_t cache_zone_large_ts;
 
 #define	CACHE_PATH_CUTOFF	35
 
 static struct namecache *
 cache_alloc(int len, int ts)
 {
 
 	if (len > CACHE_PATH_CUTOFF) {
 		if (ts)
 			return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
 		else
 			return (uma_zalloc(cache_zone_large, M_WAITOK));
 	}
 	if (ts)
 		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
 	else
 		return (uma_zalloc(cache_zone_small, M_WAITOK));
 }
 
 static void
 cache_free(struct namecache *ncp)
 {
 	int ts;
 
 	if (ncp == NULL)
 		return;
 	ts = ncp->nc_flag & NCF_TS;
+	if ((ncp->nc_flag & NCF_DVDROP) != 0)
+		vdrop(ncp->nc_dvp);
 	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
 		if (ts)
 			uma_zfree(cache_zone_small_ts, ncp);
 		else
 			uma_zfree(cache_zone_small, ncp);
 	} else if (ts)
 		uma_zfree(cache_zone_large_ts, ncp);
 	else
 		uma_zfree(cache_zone_large, ncp);
 }
 
 static char *
 nc_get_name(struct namecache *ncp)
 {
 	struct namecache_ts *ncp_ts;
 
 	if ((ncp->nc_flag & NCF_TS) == 0)
 		return (ncp->nc_name);
 	ncp_ts = (struct namecache_ts *)ncp;
 	return (ncp_ts->nc_name);
 }
 
 static void
 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 {
 
 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 	    (tsp == NULL && ticksp == NULL),
 	    ("No NCF_TS"));
 
 	if (tsp != NULL)
 		*tsp = ((struct namecache_ts *)ncp)->nc_time;
 	if (ticksp != NULL)
 		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
 }
 
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
     "VFS namecache enabled");
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
     sizeof(struct namecache), "sizeof(struct namecache)");
 
 /*
  * The new name cache statistics
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
     "Name cache statistics");
 #define STATNODE_ULONG(name, descr)	\
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 #define STATNODE_COUNTER(name, descr)	\
 	static counter_u64_t name;	\
 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
 STATNODE_ULONG(numneg, "Number of negative cache entries");
 STATNODE_ULONG(numcache, "Number of cache entries");
 STATNODE_COUNTER(numcalls, "Number of cache lookups");
 STATNODE_COUNTER(dothits, "Number of '.' hits");
 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
 STATNODE_COUNTER(nummiss, "Number of cache misses");
 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 STATNODE_COUNTER(numposzaps,
     "Number of cache hits (positive) we do not want to cache");
 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 STATNODE_COUNTER(numnegzaps,
     "Number of cache hits (negative) we do not want to cache");
 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 /* These count for kern___getcwd(), too. */
 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 STATNODE_COUNTER(numfullpathfail2,
     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
-static long numupgrades; STATNODE_ULONG(numupgrades,
-    "Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+    "Number of times zap_and_exit failed to lock");
+static long cache_lock_vnodes_cel_3_failures;
+STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
+    "Number of times 3-way vnode locking failed");
 
-static void cache_zap(struct namecache *ncp);
-static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
-    u_int *buflen);
+static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
+static int cache_yield;
+SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
+    "Number of times cache called yield");
+
+static void
+cache_maybe_yield(void)
+{
+
+	if (should_yield()) {
+		cache_yield++;
+		kern_yield(PRI_USER);
+	}
+}
+
+static inline void
+cache_assert_vlp_locked(struct mtx *vlp)
+{
+
+	if (vlp != NULL)
+		mtx_assert(vlp, MA_OWNED);
+}
+
+static inline void
+cache_assert_vnode_locked(struct vnode *vp)
+{
+	struct mtx *vlp;
+
+	vlp = VP2VNODELOCK(vp);
+	cache_assert_vlp_locked(vlp);
+}
+
 static uint32_t
 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 {
 	uint32_t hash;
 
 	hash = fnv_32_buf(name, len, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	return (hash);
 }
 
+static inline struct rwlock *
+NCP2BUCKETLOCK(struct namecache *ncp)
+{
+	uint32_t hash;
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	return (HASH2BUCKETLOCK(hash));
+}
+
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+	struct rwlock *blp;
+
+	blp = NCP2BUCKETLOCK(ncp);
+	rw_assert(blp, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+#define cache_sort(x, y)	_cache_sort((void **)(x), (void **)(y))
+static void
+_cache_sort(void **p1, void **p2)
+{
+	void *tmp;
+
+	if (*p1 > *p2) {
+		tmp = *p2;
+		*p2 = *p1;
+		*p1 = tmp;
+	}
+}
+
+static void
+cache_lock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wunlock(&bucketlocks[i]);
+}
+
+static void
+cache_lock_all_vnodes(void)
+{
+	u_int i;
+
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_lock(&vnodelocks[i]);
+}
+
+static void
+cache_unlock_all_vnodes(void)
+{
+	u_int i;
+
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_unlock(&vnodelocks[i]);
+}
+
 static int
+cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+	cache_sort(&vlp1, &vlp2);
+	MPASS(vlp2 != NULL);
+
+	if (vlp1 != NULL) {
+		if (!mtx_trylock(vlp1))
+			return (EAGAIN);
+	}
+	if (!mtx_trylock(vlp2)) {
+		if (vlp1 != NULL)
+			mtx_unlock(vlp1);
+		return (EAGAIN);
+	}
+
+	return (0);
+}
+
+static void
+cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+	MPASS(vlp1 != NULL || vlp2 != NULL);
+
+	if (vlp1 != NULL)
+		mtx_unlock(vlp1);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
+}
+
+static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
 	struct nchstats snap;
 
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
 
 	snap = nchstats;
 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
 	snap.ncs_neghits = counter_u64_fetch(numneghits);
 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 	    counter_u64_fetch(numnegzaps);
 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
 	    counter_u64_fetch(nummiss);
 
 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 }
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
     "VFS cache effectiveness statistics");
 
 #ifdef DIAGNOSTIC
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
  */
 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
     "hash table stats");
 
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int i, error, n_nchash, *cntbuf;
 
 retry:
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (req->oldptr == NULL)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
-	CACHE_RLOCK();
+	cache_lock_all_buckets();
 	if (n_nchash != nchash + 1) {
-		CACHE_RUNLOCK();
+		cache_unlock_all_buckets();
 		free(cntbuf, M_TEMP);
 		goto retry;
 	}
 	/* Scan hash tables counting entries */
 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 		LIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
-	CACHE_RUNLOCK();
+	cache_unlock_all_buckets();
 	for (error = 0, i = 0; i < n_nchash; i++)
 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 			break;
 	free(cntbuf, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
     "nchash chain lengths");
 
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
-	CACHE_RLOCK();
+	cache_lock_all_buckets();
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	n_nchash = nchash + 1;
-	CACHE_RUNLOCK();
+	cache_unlock_all_buckets();
 	pct = (used * 100) / (n_nchash / 100);
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
 	if (error)
 		return (error);
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
- * cache_zap():
+ * Negative entries management
  *
+ * A variation of LRU scheme is used. New entries are hashed into one of
+ * numneglists cold lists. Entries get promoted to the hot list on first hit.
+ * Partial LRU for the hot list is maintained by requeueing them every
+ * ncneghitsrequeue hits.
+ *
+ * The shrinker will demote hot list head and evict from the cold list in a
+ * round-robin manner.
+ */
+static void
+cache_negative_hit(struct namecache *ncp)
+{
+	struct neglist *neglist;
+	u_int hits;
+
+	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	hits = atomic_fetchadd_int(&ncp->nc_neghits, 1);
+	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+		if ((hits % ncneghitsrequeue) != 0)
+			return;
+		mtx_lock(&ncneg_hot.nl_lock);
+		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+			TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+			TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
+			mtx_unlock(&ncneg_hot.nl_lock);
+			return;
+		}
+		/*
+		 * The shrinker cleared the flag and removed the entry from
+		 * the hot list. Put it back.
+		 */
+	} else {
+		mtx_lock(&ncneg_hot.nl_lock);
+	}
+	neglist = NCP2NEGLIST(ncp);
+	mtx_lock(&neglist->nl_lock);
+	if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
+		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
+		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
+		ncp->nc_flag |= NCF_HOTNEGATIVE;
+	}
+	mtx_unlock(&neglist->nl_lock);
+	mtx_unlock(&ncneg_hot.nl_lock);
+}
+
+static void
+cache_negative_insert(struct namecache *ncp, bool neg_locked)
+{
+	struct neglist *neglist;
+
+	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+	neglist = NCP2NEGLIST(ncp);
+	if (!neg_locked) {
+		mtx_lock(&neglist->nl_lock);
+	} else {
+		mtx_assert(&neglist->nl_lock, MA_OWNED);
+	}
+	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
+	if (!neg_locked)
+		mtx_unlock(&neglist->nl_lock);
+	atomic_add_rel_long(&numneg, 1);
+}
+
+static void
+cache_negative_remove(struct namecache *ncp, bool neg_locked)
+{
+	struct neglist *neglist;
+	bool hot_locked = false;
+	bool list_locked = false;
+
+	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+	neglist = NCP2NEGLIST(ncp);
+	if (!neg_locked) {
+		if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+			hot_locked = true;
+			mtx_lock(&ncneg_hot.nl_lock);
+			if (!(ncp->nc_flag & NCF_HOTNEGATIVE)) {
+				list_locked = true;
+				mtx_lock(&neglist->nl_lock);
+			}
+		} else {
+			list_locked = true;
+			mtx_lock(&neglist->nl_lock);
+		}
+	} else {
+		mtx_assert(&neglist->nl_lock, MA_OWNED);
+		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
+	}
+	if (ncp->nc_flag & NCF_HOTNEGATIVE) {
+		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+	} else {
+		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
+	}
+	if (list_locked)
+		mtx_unlock(&neglist->nl_lock);
+	if (hot_locked)
+		mtx_unlock(&ncneg_hot.nl_lock);
+	atomic_subtract_rel_long(&numneg, 1);
+}
+
+static void
+cache_negative_shrink_select(int start, struct namecache **ncpp,
+    struct neglist **neglistpp)
+{
+	struct neglist *neglist;
+	struct namecache *ncp;
+	int i;
+
+	*ncpp = ncp = NULL;
+
+	for (i = start; i < numneglists; i++) {
+		neglist = &neglists[i];
+		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
+			continue;
+		mtx_lock(&neglist->nl_lock);
+		ncp = TAILQ_FIRST(&neglist->nl_list);
+		if (ncp != NULL)
+			break;
+		mtx_unlock(&neglist->nl_lock);
+	}
+
+	*neglistpp = neglist;
+	*ncpp = ncp;
+}
+
+static void
+cache_negative_zap_one(void)
+{
+	struct namecache *ncp, *ncp2;
+	struct neglist *neglist;
+	struct mtx *dvlp;
+	struct rwlock *blp;
+
+	if (!mtx_trylock(&ncneg_shrink_lock))
+		return;
+
+	mtx_lock(&ncneg_hot.nl_lock);
+	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
+	if (ncp != NULL) {
+		neglist = NCP2NEGLIST(ncp);
+		mtx_lock(&neglist->nl_lock);
+		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
+		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
+		ncp->nc_flag &= ~NCF_HOTNEGATIVE;
+		mtx_unlock(&neglist->nl_lock);
+	}
+
+	cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
+	shrink_list_turn++;
+	if (shrink_list_turn == numneglists)
+		shrink_list_turn = 0;
+	if (ncp == NULL && shrink_list_turn == 0)
+		cache_negative_shrink_select(shrink_list_turn, &ncp, &neglist);
+	if (ncp == NULL) {
+		mtx_unlock(&ncneg_hot.nl_lock);
+		goto out;
+	}
+
+	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	blp = NCP2BUCKETLOCK(ncp);
+	mtx_unlock(&neglist->nl_lock);
+	mtx_unlock(&ncneg_hot.nl_lock);
+	mtx_lock(dvlp);
+	rw_wlock(blp);
+	mtx_lock(&ncneg_hot.nl_lock);
+	mtx_lock(&neglist->nl_lock);
+	ncp2 = TAILQ_FIRST(&neglist->nl_list);
+	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
+	    blp != NCP2BUCKETLOCK(ncp2) || !(ncp2->nc_flag & NCF_NEGATIVE)) {
+		ncp = NULL;
+		goto out_unlock_all;
+	}
+	SDT_PROBE3(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
+	    nc_get_name(ncp), ncp->nc_neghits);
+
+	cache_zap_locked(ncp, true);
+out_unlock_all:
+	mtx_unlock(&neglist->nl_lock);
+	mtx_unlock(&ncneg_hot.nl_lock);
+	rw_wunlock(blp);
+	mtx_unlock(dvlp);
+out:
+	mtx_unlock(&ncneg_shrink_lock);
+	cache_free(ncp);
+}
+
+/*
+ * cache_zap_locked():
+ *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp, bool neg_locked)
 {
-	struct vnode *vp;
 
-	rw_assert(&cache_lock, RA_WLOCKED);
-	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
-	if (ncp->nc_vp != NULL) {
+	if (!(ncp->nc_flag & NCF_NEGATIVE))
+		cache_assert_vnode_locked(ncp->nc_vp);
+	cache_assert_vnode_locked(ncp->nc_dvp);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
+	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
+	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 		    nc_get_name(ncp), ncp->nc_vp);
 	} else {
-		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
-		    nc_get_name(ncp));
+		SDT_PROBE3(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+		    nc_get_name(ncp), ncp->nc_neghits);
 	}
-	vp = NULL;
 	LIST_REMOVE(ncp, nc_hash);
+	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
+		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
+		if (ncp == ncp->nc_vp->v_cache_dd)
+			ncp->nc_vp->v_cache_dd = NULL;
+	} else {
+		cache_negative_remove(ncp, neg_locked);
+	}
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
 		if (ncp == ncp->nc_dvp->v_cache_dd)
 			ncp->nc_dvp->v_cache_dd = NULL;
 	} else {
 		LIST_REMOVE(ncp, nc_src);
 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
-			vp = ncp->nc_dvp;
-			numcachehv--;
+			ncp->nc_flag |= NCF_DVDROP;
+			atomic_subtract_rel_long(&numcachehv, 1);
 		}
 	}
-	if (ncp->nc_vp) {
-		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
-		if (ncp == ncp->nc_vp->v_cache_dd)
-			ncp->nc_vp->v_cache_dd = NULL;
+	atomic_subtract_rel_long(&numcache, 1);
+}
+
+static void
+cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
+{
+	struct rwlock *blp;
+
+	MPASS(ncp->nc_dvp == vp);
+	MPASS(ncp->nc_flag & NCF_NEGATIVE);
+	cache_assert_vnode_locked(vp);
+
+	blp = NCP2BUCKETLOCK(ncp);
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
+}
+
+static bool
+cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
+    struct mtx **vlpp)
+{
+	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+	struct rwlock *blp;
+
+	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+	cache_assert_vnode_locked(vp);
+
+	if (ncp->nc_flag & NCF_NEGATIVE) {
+		if (*vlpp != NULL) {
+			mtx_unlock(*vlpp);
+			*vlpp = NULL;
+		}
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
+		return (true);
+	}
+
+	pvlp = VP2VNODELOCK(vp);
+	blp = NCP2BUCKETLOCK(ncp);
+	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+	vlp2 = VP2VNODELOCK(ncp->nc_vp);
+
+	if (*vlpp == vlp1 || *vlpp == vlp2) {
+		to_unlock = *vlpp;
+		*vlpp = NULL;
 	} else {
-		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-		numneg--;
+		if (*vlpp != NULL) {
+			mtx_unlock(*vlpp);
+			*vlpp = NULL;
+		}
+		cache_sort(&vlp1, &vlp2);
+		if (vlp1 == pvlp) {
+			mtx_lock(vlp2);
+			to_unlock = vlp2;
+		} else {
+			if (!mtx_trylock(vlp1))
+				goto out_relock;
+			to_unlock = vlp1;
+		}
 	}
-	numcache--;
-	cache_free(ncp);
-	if (vp != NULL)
-		vdrop(vp);
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
+	if (to_unlock != NULL)
+		mtx_unlock(to_unlock);
+	return (true);
+
+out_relock:
+	mtx_unlock(vlp2);
+	mtx_lock(vlp1);
+	mtx_lock(vlp2);
+	MPASS(*vlpp == NULL);
+	*vlpp = vlp1;
+	return (false);
 }
 
+static int
+cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
+{
+	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+	struct rwlock *blp;
+	int error = 0;
+
+	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+	cache_assert_vnode_locked(vp);
+
+	pvlp = VP2VNODELOCK(vp);
+	if (ncp->nc_flag & NCF_NEGATIVE) {
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
+		goto out;
+	}
+
+	blp = NCP2BUCKETLOCK(ncp);
+	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+	vlp2 = VP2VNODELOCK(ncp->nc_vp);
+	cache_sort(&vlp1, &vlp2);
+	if (vlp1 == pvlp) {
+		mtx_lock(vlp2);
+		to_unlock = vlp2;
+	} else {
+		if (!mtx_trylock(vlp1)) {
+			error = EAGAIN;
+			goto out;
+		}
+		to_unlock = vlp1;
+	}
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
+	mtx_unlock(to_unlock);
+out:
+	mtx_unlock(pvlp);
+	return (error);
+}
+
+static int
+cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
+{
+	struct mtx *dvlp, *vlp;
+
+	cache_assert_bucket_locked(ncp, RA_RLOCKED);
+
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	vlp = NULL;
+	if (!(ncp->nc_flag & NCF_NEGATIVE))
+		vlp = VP2VNODELOCK(ncp->nc_vp);
+	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+		rw_runlock(blp);
+		rw_wlock(blp);
+		cache_zap_locked(ncp, false);
+		rw_wunlock(blp);
+		cache_unlock_vnodes(dvlp, vlp);
+		return (0);
+	}
+
+	rw_runlock(blp);
+	return (EAGAIN);
+}
+
+static int
+cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
+    struct mtx **vlpp1, struct mtx **vlpp2)
+{
+	struct mtx *dvlp, *vlp;
+
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	vlp = NULL;
+	if (!(ncp->nc_flag & NCF_NEGATIVE))
+		vlp = VP2VNODELOCK(ncp->nc_vp);
+	cache_sort(&dvlp, &vlp);
+
+	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
+		cache_zap_locked(ncp, false);
+		cache_unlock_vnodes(dvlp, vlp);
+		*vlpp1 = NULL;
+		*vlpp2 = NULL;
+		return (0);
+	}
+
+	if (*vlpp1 != NULL)
+		mtx_unlock(*vlpp1);
+	if (*vlpp2 != NULL)
+		mtx_unlock(*vlpp2);
+	*vlpp1 = NULL;
+	*vlpp2 = NULL;
+
+	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+		cache_zap_locked(ncp, false);
+		cache_unlock_vnodes(dvlp, vlp);
+		return (0);
+	}
+
+	rw_wunlock(blp);
+	*vlpp1 = dvlp;
+	*vlpp2 = vlp;
+	if (*vlpp1 != NULL)
+		mtx_lock(*vlpp1);
+	mtx_lock(*vlpp2);
+	rw_wlock(blp);
+	return (EAGAIN);
+}
+
+static void
+cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
+{
+
+	if (blp != NULL) {
+		rw_runlock(blp);
+		mtx_assert(vlp, MA_NOTOWNED);
+	} else {
+		mtx_unlock(vlp);
+	}
+}
+
 /*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative caching), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.  If the directory vnode is
  * recycled out from under us due to a forced unmount, a status of
  * ENOENT is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
  * not recursively acquired.
  */
 
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
 	struct namecache *ncp;
+	struct rwlock *blp;
+	struct mtx *dvlp, *dvlp2;
 	uint32_t hash;
-	int error, ltype, wlocked;
+	int error, ltype;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 retry:
-	wlocked = 0;
-	counter_u64_add(numcalls, 1);
+	blp = NULL;
+	dvlp = VP2VNODELOCK(dvp);
 	error = 0;
+	counter_u64_add(numcalls, 1);
 
-retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			*vpp = dvp;
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			counter_u64_add(dothits, 1);
 			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 			if (tsp != NULL)
 				timespecclear(tsp);
 			if (ticksp != NULL)
 				*ticksp = ticks;
 			VREF(*vpp);
 			/*
 			 * When we lookup "." we still can be asked to lock it
 			 * differently...
 			 */
 			ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 			if (ltype != VOP_ISLOCKED(*vpp)) {
 				if (ltype == LK_EXCLUSIVE) {
 					vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 					if ((*vpp)->v_iflag & VI_DOOMED) {
 						/* forced unmount */
 						vrele(*vpp);
 						*vpp = NULL;
 						return (ENOENT);
 					}
 				} else
 					vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 			}
 			return (-1);
 		}
-		if (!wlocked)
-			CACHE_RLOCK();
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			counter_u64_add(dotdothits, 1);
-			if (dvp->v_cache_dd == NULL) {
+			dvlp2 = NULL;
+			mtx_lock(dvlp);
+retry_dotdot:
+			ncp = dvp->v_cache_dd;
+			if (ncp == NULL) {
 				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 				    "..", NULL);
-				goto unlock;
+				mtx_unlock(dvlp);
+				return (0);
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
-				if (!wlocked && !CACHE_UPGRADE_LOCK())
-					goto wlock;
-				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
-					cache_zap(dvp->v_cache_dd);
-				dvp->v_cache_dd = NULL;
-				CACHE_WUNLOCK();
+				if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
+					if (ncp->nc_dvp != dvp)
+						panic("dvp %p v_cache_dd %p\n", dvp, ncp);
+					if (!cache_zap_locked_vnode_kl2(ncp,
+					    dvp, &dvlp2))
+						goto retry_dotdot;
+					MPASS(dvp->v_cache_dd == NULL);
+					mtx_unlock(dvlp);
+					if (dvlp2 != NULL)
+						mtx_unlock(dvlp2);
+					cache_free(ncp);
+				} else {
+					dvp->v_cache_dd = NULL;
+					mtx_unlock(dvlp);
+					if (dvlp2 != NULL)
+						mtx_unlock(dvlp2);
+				}
 				return (0);
 			}
-			ncp = dvp->v_cache_dd;
-			if (ncp->nc_flag & NCF_ISDOTDOT)
-				*vpp = ncp->nc_vp;
-			else
+			if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
+				if (ncp->nc_flag & NCF_NEGATIVE)
+					*vpp = NULL;
+				else
+					*vpp = ncp->nc_vp;
+			} else
 				*vpp = ncp->nc_dvp;
 			/* Return failure if negative entry was found. */
 			if (*vpp == NULL)
 				goto negative_success;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
 			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 			    *vpp);
 			cache_out_ts(ncp, tsp, ticksp);
 			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 			    NCF_DTS && tsp != NULL)
 				*tsp = ((struct namecache_ts *)ncp)->
 				    nc_dotdottime;
 			goto success;
 		}
-	} else if (!wlocked)
-		CACHE_RLOCK();
+	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+	blp = HASH2BUCKETLOCK(hash);
+	rw_rlock(blp);
+
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == NULL) {
 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL);
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			counter_u64_add(nummisszap, 1);
 		} else {
 			counter_u64_add(nummiss, 1);
 		}
 		goto unlock;
 	}
 
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		counter_u64_add(numposzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		return (0);
+		goto zap_and_exit;
 	}
 
 	/* We found a "positive" match, return the vnode */
-	if (ncp->nc_vp) {
+	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		counter_u64_add(numposhits, 1);
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
 		    *vpp);
 		cache_out_ts(ncp, tsp, ticksp);
 		goto success;
 	}
 
 negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		return (0);
+		goto zap_and_exit;
 	}
 
-	if (!wlocked && !CACHE_UPGRADE_LOCK())
-		goto wlock;
 	counter_u64_add(numneghits, 1);
-	/*
-	 * We found a "negative" match, so we shift it to the end of
-	 * the "negative" cache entries queue to satisfy LRU.  Also,
-	 * check to see if the entry is a whiteout; indicate this to
-	 * the componentname, if so.
-	 */
-	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
-	CACHE_WUNLOCK();
+	cache_lookup_unlock(blp, dvlp);
 	return (ENOENT);
 
-wlock:
-	/*
-	 * We need to update the cache after our lookup, so upgrade to
-	 * a write lock and retry the operation.
-	 */
-	CACHE_RUNLOCK();
-	CACHE_WLOCK();
-	numupgrades++;
-	wlocked = 1;
-	goto retry_wlocked;
-
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	MPASS(dvp != *vpp);
 	ltype = 0;	/* silence gcc warning */
 	if (cnp->cn_flags & ISDOTDOT) {
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 	}
 	vhold(*vpp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	cache_lookup_unlock(blp, dvlp);
 	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
 		if (dvp->v_iflag & VI_DOOMED) {
 			if (error == 0)
 				vput(*vpp);
 			*vpp = NULL;
 			return (ENOENT);
 		}
 	}
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	}
 	return (-1);
 
 unlock:
-	if (wlocked)
-		CACHE_WUNLOCK();
+	cache_lookup_unlock(blp, dvlp);
+	return (0);
+
+zap_and_exit:
+	if (blp != NULL)
+		error = cache_zap_rlocked_bucket(ncp, blp);
 	else
-		CACHE_RUNLOCK();
+		error = cache_zap_locked_vnode(ncp, dvp);
+	if (error != 0) {
+		zap_and_exit_bucket_fail++;
+		cache_maybe_yield();
+		goto retry;
+	}
+	cache_free(ncp);
 	return (0);
 }
 
+struct celockstate {
+	struct mtx *vlp[3];
+	struct rwlock *blp[2];
+};
+CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
+CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
+
+static inline void
+cache_celockstate_init(struct celockstate *cel)
+{
+
+	bzero(cel, sizeof(*cel));
+}
+
+static void
+cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
+    struct vnode *dvp)
+{
+	struct mtx *vlp1, *vlp2;
+
+	MPASS(cel->vlp[0] == NULL);
+	MPASS(cel->vlp[1] == NULL);
+	MPASS(cel->vlp[2] == NULL);
+
+	MPASS(vp != NULL || dvp != NULL);
+
+	vlp1 = VP2VNODELOCK(vp);
+	vlp2 = VP2VNODELOCK(dvp);
+	cache_sort(&vlp1, &vlp2);
+
+	if (vlp1 != NULL) {
+		mtx_lock(vlp1);
+		cel->vlp[0] = vlp1;
+	}
+	mtx_lock(vlp2);
+	cel->vlp[1] = vlp2;
+}
+
+static void
+cache_unlock_vnodes_cel(struct celockstate *cel)
+{
+
+	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
+
+	if (cel->vlp[0] != NULL)
+		mtx_unlock(cel->vlp[0]);
+	if (cel->vlp[1] != NULL)
+		mtx_unlock(cel->vlp[1]);
+	if (cel->vlp[2] != NULL)
+		mtx_unlock(cel->vlp[2]);
+}
+
+static bool
+cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
+{
+	struct mtx *vlp;
+	bool ret;
+
+	cache_assert_vlp_locked(cel->vlp[0]);
+	cache_assert_vlp_locked(cel->vlp[1]);
+	MPASS(cel->vlp[2] == NULL);
+
+	vlp = VP2VNODELOCK(vp);
+	MPASS(vlp != NULL);
+
+	ret = true;
+	if (vlp >= cel->vlp[1]) {
+		mtx_lock(vlp);
+	} else {
+		if (mtx_trylock(vlp))
+			goto out;
+		cache_lock_vnodes_cel_3_failures++;
+		cache_unlock_vnodes_cel(cel);
+		if (vlp < cel->vlp[0]) {
+			mtx_lock(vlp);
+			mtx_lock(cel->vlp[0]);
+			mtx_lock(cel->vlp[1]);
+		} else {
+			if (cel->vlp[0] != NULL)
+				mtx_lock(cel->vlp[0]);
+			mtx_lock(vlp);
+			mtx_lock(cel->vlp[1]);
+		}
+		ret = false;
+	}
+out:
+	cel->vlp[2] = vlp;
+	return (ret);
+}
+
+static void
+cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
+    struct rwlock *blp2)
+{
+
+	MPASS(cel->blp[0] == NULL);
+	MPASS(cel->blp[1] == NULL);
+
+	cache_sort(&blp1, &blp2);
+
+	if (blp1 != NULL) {
+		rw_wlock(blp1);
+		cel->blp[0] = blp1;
+	}
+	rw_wlock(blp2);
+	cel->blp[1] = blp2;
+}
+
+static void
+cache_unlock_buckets_cel(struct celockstate *cel)
+{
+
+	if (cel->blp[0] != NULL)
+		rw_wunlock(cel->blp[0]);
+	rw_wunlock(cel->blp[1]);
+}
+
 /*
+ * Lock part of the cache affected by the insertion.
+ *
+ * This means vnodelocks for dvp, vp and the relevant bucketlock.
+ * However, insertion can result in removal of an old entry. In this
+ * case we have an additional vnode and bucketlock pair to lock. If the
+ * entry is negative, ncelock is locked instead of the vnode.
+ *
+ * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
+ * preserving the locking order (smaller address first).
+ */
+static void
+cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+    uint32_t hash)
+{
+	struct namecache *ncp;
+	struct rwlock *blps[2];
+
+	blps[0] = HASH2BUCKETLOCK(hash);
+	for (;;) {
+		blps[1] = NULL;
+		cache_lock_vnodes_cel(cel, dvp, vp);
+		if (vp == NULL || vp->v_type != VDIR)
+			break;
+		ncp = vp->v_cache_dd;
+		if (ncp == NULL)
+			break;
+		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+			break;
+		MPASS(ncp->nc_dvp == vp);
+		blps[1] = NCP2BUCKETLOCK(ncp);
+		if (ncp->nc_flag & NCF_NEGATIVE)
+			break;
+		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+			break;
+		/*
+		 * All vnodes got re-locked. Re-validate the state and if
+		 * nothing changed we are done. Otherwise restart.
+		 */
+		if (ncp == vp->v_cache_dd &&
+		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+		    blps[1] == NCP2BUCKETLOCK(ncp) &&
+		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+			break;
+		cache_unlock_vnodes_cel(cel);
+		cel->vlp[0] = NULL;
+		cel->vlp[1] = NULL;
+		cel->vlp[2] = NULL;
+	}
+	cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
+
+static void
+cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+    uint32_t hash)
+{
+	struct namecache *ncp;
+	struct rwlock *blps[2];
+
+	blps[0] = HASH2BUCKETLOCK(hash);
+	for (;;) {
+		blps[1] = NULL;
+		cache_lock_vnodes_cel(cel, dvp, vp);
+		ncp = dvp->v_cache_dd;
+		if (ncp == NULL)
+			break;
+		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+			break;
+		MPASS(ncp->nc_dvp == dvp);
+		blps[1] = NCP2BUCKETLOCK(ncp);
+		if (ncp->nc_flag & NCF_NEGATIVE)
+			break;
+		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+			break;
+		if (ncp == dvp->v_cache_dd &&
+		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+		    blps[1] == NCP2BUCKETLOCK(ncp) &&
+		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+			break;
+		cache_unlock_vnodes_cel(cel);
+		cel->vlp[0] = NULL;
+		cel->vlp[1] = NULL;
+		cel->vlp[2] = NULL;
+	}
+	cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
+
+static void
+cache_enter_unlock(struct celockstate *cel)
+{
+
+	cache_unlock_buckets_cel(cel);
+	cache_unlock_vnodes_cel(cel);
+}
+
+/*
  * Add an entry to the cache.
  */
 void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
-	struct namecache *ncp, *n2;
+	struct celockstate cel;
+	struct namecache *ncp, *n2, *ndd;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
+	struct neglist *neglist;
 	uint32_t hash;
 	int flag;
 	int len;
+	bool neg_locked;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("cache_enter: Adding a doomed vnode"));
 	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 	    ("cache_enter: Doomed vnode used as src"));
 
 	if (!doingcache)
 		return;
 
 	/*
 	 * Avoid blowout in namecache entries.
 	 */
 	if (numcache >= desiredvnodes * ncsizefactor)
 		return;
 
+	cache_celockstate_init(&cel);
+	ndd = NULL;
 	flag = 0;
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1)
 			return;
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
-			CACHE_WLOCK();
+			len = cnp->cn_namelen;
+			hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
+			cache_enter_lock_dd(&cel, dvp, vp, hash);
 			/*
 			 * If dotdot entry already exists, just retarget it
 			 * to new parent vnode, otherwise continue with new
 			 * namecache entry allocation.
 			 */
 			if ((ncp = dvp->v_cache_dd) != NULL &&
 			    ncp->nc_flag & NCF_ISDOTDOT) {
 				KASSERT(ncp->nc_dvp == dvp,
 				    ("wrong isdotdot parent"));
-				if (ncp->nc_vp != NULL) {
+				neg_locked = false;
+				if (ncp->nc_flag & NCF_NEGATIVE || vp == NULL) {
+					neglist = NCP2NEGLIST(ncp);
+					mtx_lock(&ncneg_hot.nl_lock);
+					mtx_lock(&neglist->nl_lock);
+					neg_locked = true;
+				}
+				if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 					    ncp, nc_dst);
 				} else {
-					TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-					numneg--;
+					cache_negative_remove(ncp, true);
 				}
 				if (vp != NULL) {
 					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 					    ncp, nc_dst);
+					ncp->nc_flag &= ~(NCF_NEGATIVE|NCF_HOTNEGATIVE);
 				} else {
-					TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-					numneg++;
+					ncp->nc_flag &= ~(NCF_HOTNEGATIVE);
+					ncp->nc_flag |= NCF_NEGATIVE;
+					cache_negative_insert(ncp, true);
 				}
+				if (neg_locked) {
+					mtx_unlock(&neglist->nl_lock);
+					mtx_unlock(&ncneg_hot.nl_lock);
+				}
 				ncp->nc_vp = vp;
-				CACHE_WUNLOCK();
+				cache_enter_unlock(&cel);
 				return;
 			}
 			dvp->v_cache_dd = NULL;
+			cache_enter_unlock(&cel);
+			cache_celockstate_init(&cel);
 			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
-			CACHE_WUNLOCK();
 			flag = NCF_ISDOTDOT;
 		}
 	}
 
 	/*
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
+	ncp->nc_flag = flag;
 	ncp->nc_vp = vp;
+	if (vp == NULL)
+		ncp->nc_flag |= NCF_NEGATIVE;
 	ncp->nc_dvp = dvp;
-	ncp->nc_flag = flag;
 	if (tsp != NULL) {
 		n3 = (struct namecache_ts *)ncp;
 		n3->nc_time = *tsp;
 		n3->nc_ticks = ticks;
 		n3->nc_flag |= NCF_TS;
 		if (dtsp != NULL) {
 			n3->nc_dotdottime = *dtsp;
 			n3->nc_flag |= NCF_DTS;
 		}
 	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
-	CACHE_WLOCK();
+	cache_enter_lock(&cel, dvp, vp, hash);
 
 	/*
 	 * See if this vnode or negative entry is already in the cache
 	 * with this name.  This can happen with concurrent lookups of
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
 	LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
 			if (tsp != NULL) {
 				KASSERT((n2->nc_flag & NCF_TS) != 0,
 				    ("no NCF_TS"));
 				n3 = (struct namecache_ts *)n2;
 				n3->nc_time =
 				    ((struct namecache_ts *)ncp)->nc_time;
 				n3->nc_ticks =
 				    ((struct namecache_ts *)ncp)->nc_ticks;
 				if (dtsp != NULL) {
 					n3->nc_dotdottime =
 					    ((struct namecache_ts *)ncp)->
 					    nc_dotdottime;
+					if (ncp->nc_flag & NCF_NEGATIVE)
+						mtx_lock(&ncneg_hot.nl_lock);
 					n3->nc_flag |= NCF_DTS;
+					if (ncp->nc_flag & NCF_NEGATIVE)
+						mtx_unlock(&ncneg_hot.nl_lock);
 				}
 			}
-			CACHE_WUNLOCK();
-			cache_free(ncp);
-			return;
+			goto out_unlock_free;
 		}
 	}
 
 	if (flag == NCF_ISDOTDOT) {
 		/*
 		 * See if we are trying to add .. entry, but some other lookup
 		 * has populated v_cache_dd pointer already.
 		 */
-		if (dvp->v_cache_dd != NULL) {
-			CACHE_WUNLOCK();
-			cache_free(ncp);
-			return;
-		}
+		if (dvp->v_cache_dd != NULL)
+			goto out_unlock_free;
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
 		dvp->v_cache_dd = ncp;
 	}
 
-	numcache++;
+	atomic_add_rel_long(&numcache, 1);
 	if (vp != NULL) {
 		if (vp->v_type == VDIR) {
 			if (flag != NCF_ISDOTDOT) {
 				/*
 				 * For this case, the cache entry maps both the
 				 * directory name in it and the name ".." for the
 				 * directory's parent.
 				 */
-				if ((n2 = vp->v_cache_dd) != NULL &&
-				    (n2->nc_flag & NCF_ISDOTDOT) != 0)
-					cache_zap(n2);
+				if ((ndd = vp->v_cache_dd) != NULL) {
+					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
+						cache_zap_locked(ndd, false);
+					else
+						ndd = NULL;
+				}
 				vp->v_cache_dd = ncp;
 			}
 		} else {
 			vp->v_cache_dd = NULL;
 		}
 	}
 
-	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (flag != NCF_ISDOTDOT) {
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
-			numcachehv++;
+			atomic_add_rel_long(&numcachehv, 1);
 		}
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
 	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
+	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp != NULL) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 		SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
 		    vp);
 	} else {
 		if (cnp->cn_flags & ISWHITEOUT)
 			ncp->nc_flag |= NCF_WHITE;
-		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-		numneg++;
+		cache_negative_insert(ncp, false);
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp));
 	}
-	if (numneg * ncnegfactor > numcache) {
-		ncp = TAILQ_FIRST(&ncneg);
-		KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
-		    ncp, ncp->nc_vp));
-		cache_zap(ncp);
-	}
-	CACHE_WUNLOCK();
+	cache_enter_unlock(&cel);
+	if (numneg * ncnegfactor > numcache)
+		cache_negative_zap_one();
+	cache_free(ndd);
+	return;
+out_unlock_free:
+	cache_enter_unlock(&cel);
+	cache_free(ncp);
+	return;
 }
 
+static u_int
+cache_roundup_2(u_int val)
+{
+	u_int res;
+
+	for (res = 1; res <= val; res <<= 1)
+		continue;
+
+	return (res);
+}
+
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
+	u_int i;
 
-	TAILQ_INIT(&ncneg);
-
 	cache_zone_small = uma_zcreate("S VFS Cache",
 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache",
 	    sizeof(struct namecache) + NAME_MAX + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+	numbucketlocks = cache_roundup_2(mp_ncpus * 64);
+	if (numbucketlocks > nchash + 1)
+		numbucketlocks = nchash + 1;
+	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numbucketlocks; i++)
+		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
+	numvnodelocks = cache_roundup_2(mp_ncpus * 64);
+	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
+	ncpurgeminvnodes = numbucketlocks;
 
+	numneglists = 4;
+	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numneglists; i++) {
+		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
+		TAILQ_INIT(&neglists[i].nl_list);
+	}
+	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
+	TAILQ_INIT(&ncneg_hot.nl_list);
+
 	numcalls = counter_u64_alloc(M_WAITOK);
 	dothits = counter_u64_alloc(M_WAITOK);
 	dotdothits = counter_u64_alloc(M_WAITOK);
 	numchecks = counter_u64_alloc(M_WAITOK);
 	nummiss = counter_u64_alloc(M_WAITOK);
 	nummisszap = counter_u64_alloc(M_WAITOK);
 	numposzaps = counter_u64_alloc(M_WAITOK);
 	numposhits = counter_u64_alloc(M_WAITOK);
 	numnegzaps = counter_u64_alloc(M_WAITOK);
 	numneghits = counter_u64_alloc(M_WAITOK);
 	numfullpathcalls = counter_u64_alloc(M_WAITOK);
 	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
 	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
 	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
 	numfullpathfound = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
 void
 cache_changesize(int newmaxvnodes)
 {
 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
 	u_long new_nchash, old_nchash;
 	struct namecache *ncp;
 	uint32_t hash;
 	int i;
 
-	new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
+	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
+	if (newmaxvnodes < numbucketlocks)
+		newmaxvnodes = numbucketlocks;
+
+	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
 	/* If same hash table size, nothing to do */
 	if (nchash == new_nchash) {
 		free(new_nchashtbl, M_VFSCACHE);
 		return;
 	}
 	/*
 	 * Move everything from the old hash table to the new table.
 	 * None of the namecache entries in the table can be removed
 	 * because to do so, they have to be removed from the hash table.
 	 */
-	CACHE_WLOCK();
+	cache_lock_all_vnodes();
+	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	nchashtbl = new_nchashtbl;
 	nchash = new_nchash;
 	for (i = 0; i <= old_nchash; i++) {
 		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen,
 			    ncp->nc_dvp);
 			LIST_REMOVE(ncp, nc_hash);
 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
-	CACHE_WUNLOCK();
+	cache_unlock_all_buckets();
+	cache_unlock_all_vnodes();
 	free(old_nchashtbl, M_VFSCACHE);
 }
 
 /*
  * Invalidate all entries to a particular vnode.
  */
 void
 cache_purge(struct vnode *vp)
 {
+	TAILQ_HEAD(, namecache) ncps;
+	struct namecache *ncp, *nnp;
+	struct mtx *vlp, *vlp2;
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
 	SDT_PROBE1(vfs, namecache, purge, done, vp);
-	CACHE_WLOCK();
-	while (!LIST_EMPTY(&vp->v_cache_src))
-		cache_zap(LIST_FIRST(&vp->v_cache_src));
-	while (!TAILQ_EMPTY(&vp->v_cache_dst))
-		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
-	if (vp->v_cache_dd != NULL) {
-		KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
+	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
+	    vp->v_cache_dd == NULL)
+		return;
+	TAILQ_INIT(&ncps);
+	vlp = VP2VNODELOCK(vp);
+	vlp2 = NULL;
+	mtx_lock(vlp);
+retry:
+	while (!LIST_EMPTY(&vp->v_cache_src)) {
+		ncp = LIST_FIRST(&vp->v_cache_src);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
+		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+	}
+	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
+		ncp = TAILQ_FIRST(&vp->v_cache_dst);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
+		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+	}
+	ncp = vp->v_cache_dd;
+	if (ncp != NULL) {
+		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 		   ("lost dotdot link"));
-		cache_zap(vp->v_cache_dd);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
+		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
-	CACHE_WUNLOCK();
+	mtx_unlock(vlp);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
+	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+		cache_free(ncp);
+	}
 }
 
 /*
  * Invalidate all negative entries for a particular directory vnode.
  */
 void
 cache_purge_negative(struct vnode *vp)
 {
-	struct namecache *cp, *ncp;
+	TAILQ_HEAD(, namecache) ncps;
+	struct namecache *ncp, *nnp;
+	struct mtx *vlp;
 
 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
-	CACHE_WLOCK();
-	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
-		if (cp->nc_vp == NULL)
-			cache_zap(cp);
+	TAILQ_INIT(&ncps);
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
+	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
+		if (!(ncp->nc_flag & NCF_NEGATIVE))
+			continue;
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
+		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
-	CACHE_WUNLOCK();
+	mtx_unlock(vlp);
+	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+		cache_free(ncp);
+	}
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  */
 void
-cache_purgevfs(struct mount *mp)
+cache_purgevfs(struct mount *mp, bool force)
 {
-	struct nchashhead *ncpp;
+	TAILQ_HEAD(, namecache) ncps;
+	struct mtx *vlp1, *vlp2;
+	struct rwlock *blp;
+	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
+	u_long i, j, n_nchash;
+	int error;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
-	CACHE_WLOCK();
-	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
-			if (ncp->nc_dvp->v_mount == mp)
-				cache_zap(ncp);
+	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
+		return;
+	TAILQ_INIT(&ncps);
+	n_nchash = nchash + 1;
+	vlp1 = vlp2 = NULL;
+	for (i = 0; i < numbucketlocks; i++) {
+		blp = (struct rwlock *)&bucketlocks[i];
+		rw_wlock(blp);
+		for (j = i; j < n_nchash; j += numbucketlocks) {
+retry:
+			bucket = &nchashtbl[j];
+			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+				cache_assert_bucket_locked(ncp, RA_WLOCKED);
+				if (ncp->nc_dvp->v_mount != mp)
+					continue;
+				error = cache_zap_wlocked_bucket_kl(ncp, blp,
+				    &vlp1, &vlp2);
+				if (error != 0)
+					goto retry;
+				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+			}
 		}
+		rw_wunlock(blp);
+		if (vlp1 == NULL && vlp2 == NULL)
+			cache_maybe_yield();
 	}
-	CACHE_WUNLOCK();
+	if (vlp1 != NULL)
+		mtx_unlock(vlp1);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
+
+	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
+		cache_free(ncp);
+	}
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  */
 
 int
 vfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 	if (error)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
 static int disablecwd;
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
 /* Implementation of the getcwd syscall. */
 int
 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 {
 
 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
 	    MAXPATHLEN));
 }
 
 int
 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
     u_int path_max)
 {
 	char *bp, *tmpbuf;
 	struct filedesc *fdp;
 	struct vnode *cdir, *rdir;
 	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
 	if (buflen > path_max)
 		buflen = path_max;
 
 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	vrefact(cdir);
 	rdir = fdp->fd_rdir;
 	vrefact(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
 	vrele(rdir);
 	vrele(cdir);
 
 	if (!error) {
 		if (bufseg == UIO_SYSSPACE)
 			bcopy(bp, buf, strlen(bp) + 1);
 		else
 			error = copyout(bp, buf, strlen(bp) + 1);
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(bp);
 #endif
 	}
 	free(tmpbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Thus begins the fullpath magic.
  */
 
 static int disablefullpath;
 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
     "Disable the vn_fullpath function");
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	char *buf;
 	struct filedesc *fdp;
 	struct vnode *rdir;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	rdir = fdp->fd_rdir;
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 	vrele(rdir);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * This function is similar to vn_fullpath, but it attempts to lookup the
  * pathname relative to the global root mount point.  This is required for the
  * auditing sub-system, as audited pathnames must be absolute, relative to the
  * global root mount point.
  */
 int
 vn_fullpath_global(struct thread *td, struct vnode *vn,
     char **retbuf, char **freebuf)
 {
 	char *buf;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 int
 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 {
-	int error;
-
-	CACHE_RLOCK();
-	error = vn_vptocnp_locked(vp, cred, buf, buflen);
-	if (error == 0)
-		CACHE_RUNLOCK();
-	return (error);
-}
-
-static int
-vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
-    u_int *buflen)
-{
 	struct vnode *dvp;
 	struct namecache *ncp;
+	struct mtx *vlp;
 	int error;
 
+	vlp = VP2VNODELOCK(*vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	}
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
-			CACHE_RUNLOCK();
+			mtx_unlock(vlp);
 			vrele(*vp);
 			counter_u64_add(numfullpathfail4, 1);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    vp, NULL);
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    nc_get_name(ncp), vp);
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		vrele(dvp);
-		CACHE_RLOCK();
 		return (0);
 	}
 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 	vput(*vp);
 	if (error) {
 		counter_u64_add(numfullpathfail2, 1);
 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 		return (error);
 	}
 
 	*vp = dvp;
-	CACHE_RLOCK();
 	if (dvp->v_iflag & VI_DOOMED) {
 		/* forced unmount */
-		CACHE_RUNLOCK();
 		vrele(dvp);
 		error = ENOENT;
 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 		return (error);
 	}
 	/*
 	 * *vp has its use count incremented still.
 	 */
 
 	return (0);
 }
 
 /*
  * The magic behind kern___getcwd() and vn_fullpath().
  */
 static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
 	int error, slash_prefixed;
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *vp1;
 
 	buflen--;
 	buf[buflen] = '\0';
 	error = 0;
 	slash_prefixed = 0;
 
 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 	counter_u64_add(numfullpathcalls, 1);
 	vref(vp);
-	CACHE_RLOCK();
 	if (vp->v_type != VDIR) {
-		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 		if (error)
 			return (error);
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	while (vp != rdir && vp != rootvnode) {
-		if (vp->v_vflag & VV_ROOT) {
-			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
-				CACHE_RUNLOCK();
-				vrele(vp);
+		/*
+		 * The vp vnode must be already fully constructed,
+		 * since it is either found in namecache or obtained
+		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
+		 * without obtaining the vnode lock.
+		 */
+		if ((vp->v_vflag & VV_ROOT) != 0) {
+			vn_lock(vp, LK_RETRY | LK_SHARED);
+
+			/*
+			 * With the vnode locked, check for races with
+			 * unmount, forced or not.  Note that we
+			 * already verified that vp is not equal to
+			 * the root vnode, which means that
+			 * mnt_vnodecovered can be NULL only for the
+			 * case of unmount.
+			 */
+			if ((vp->v_iflag & VI_DOOMED) != 0 ||
+			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
+			    vp1->v_mountedhere != vp->v_mount) {
+				vput(vp);
 				error = ENOENT;
 				SDT_PROBE3(vfs, namecache, fullpath, return,
 				    error, vp, NULL);
 				break;
 			}
-			vp1 = vp->v_mount->mnt_vnodecovered;
+
 			vref(vp1);
-			CACHE_RUNLOCK();
-			vrele(vp);
+			vput(vp);
 			vp = vp1;
-			CACHE_RLOCK();
 			continue;
 		}
 		if (vp->v_type != VDIR) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			counter_u64_add(numfullpathfail1, 1);
 			error = ENOTDIR;
 			SDT_PROBE3(vfs, namecache, fullpath, return,
 			    error, vp, NULL);
 			break;
 		}
-		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 		if (error)
 			break;
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    startvp, NULL);
 			break;
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	if (error)
 		return (error);
 	if (!slash_prefixed) {
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			counter_u64_add(numfullpathfail4, 1);
 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 	}
 	counter_u64_add(numfullpathfound, 1);
-	CACHE_RUNLOCK();
 	vrele(vp);
 
 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 	*retbuf = buf + buflen;
 	return (0);
 }
 
 struct vnode *
 vn_dir_dd_ino(struct vnode *vp)
 {
 	struct namecache *ncp;
 	struct vnode *ddvp;
+	struct mtx *vlp;
 
 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
-	CACHE_RLOCK();
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
 		vhold(ddvp);
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
 			return (NULL);
 		return (ddvp);
 	}
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	return (NULL);
 }
 
 int
 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 {
 	struct namecache *ncp;
+	struct mtx *vlp;
 	int l;
 
-	CACHE_RLOCK();
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	if (ncp == NULL) {
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		return (ENOENT);
 	}
 	l = min(ncp->nc_nlen, buflen - 1);
 	memcpy(buf, nc_get_name(ncp), l);
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	buf[l] = '\0';
 	return (0);
 }
 
 /* ABI compat shims for old kernel modules. */
 #undef cache_enter
 
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 
 void
 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 
 	cache_enter_time(dvp, vp, cnp, NULL, NULL);
 }
 
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
  *
  * Requires a locked, referenced vnode.
  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
  *
  * If sysctl debug.disablefullpath is set, ENODEV is returned,
  * vnode is left locked and path remain untouched.
  *
  * If vp is a directory, the call to vn_fullpath_global() always succeeds
  * because it falls back to the ".." lookup if the namecache lookup fails.
  */
 int
 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
     u_int pathlen)
 {
 	struct nameidata nd;
 	struct vnode *vp1;
 	char *rpath, *fbuf;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* Return ENODEV if sysctl debug.disablefullpath==1 */
 	if (disablefullpath)
 		return (ENODEV);
 
 	/* Construct global filesystem path from vp. */
 	VOP_UNLOCK(vp, 0);
 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 
 	if (error != 0) {
 		vrele(vp);
 		return (error);
 	}
 
 	if (strlen(rpath) >= pathlen) {
 		vrele(vp);
 		error = ENAMETOOLONG;
 		goto out;
 	}
 
 	/*
 	 * Re-lookup the vnode by path to detect a possible rename.
 	 * As a side effect, the vnode is relocked.
 	 * If vnode was renamed, return ENOENT.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, path, td);
 	error = namei(&nd);
 	if (error != 0) {
 		vrele(vp);
 		goto out;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp1 = nd.ni_vp;
 	vrele(vp);
 	if (vp1 == vp)
 		strcpy(path, rpath);
 	else {
 		vput(vp1);
 		error = ENOENT;
 	}
 
 out:
 	free(fbuf, M_TEMP);
 	return (error);
 }
Index: stable/11/sys/kern/vfs_mount.c
===================================================================
--- stable/11/sys/kern/vfs_mount.c	(revision 310958)
+++ stable/11/sys/kern/vfs_mount.c	(revision 310959)
@@ -1,2009 +1,2009 @@
 /*-
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/sbuf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
 
 static int	vfs_domount(struct thread *td, const char *fstype, char *fspath,
 		    uint64_t fsflags, struct vfsoptlist **optlist);
 static void	free_mntarg(struct mntarg *ma);
 
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 /*
  * Global opts, taken by all filesystems
  */
 static const char *global_opts[] = {
 	"errmsg",
 	"fstype",
 	"fspath",
 	"ro",
 	"rw",
 	"nosuid",
 	"noexec",
 	NULL
 };
 
 static int
 mount_init(void *mem, int size, int flags)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 	return (0);
 }
 
 static void
 mount_fini(void *mem, int size)
 {
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 static void
 vfs_mount_init(void *dummy __unused)
 {
 
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 	    NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
  */
 
 /* Remove one mount option. */
 static void
 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
 {
 
 	TAILQ_REMOVE(opts, opt, link);
 	free(opt->name, M_MOUNT);
 	if (opt->value != NULL)
 		free(opt->value, M_MOUNT);
 	free(opt, M_MOUNT);
 }
 
 /* Release all resources related to the mount options. */
 void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
 
 	while (!TAILQ_EMPTY(opts)) {
 		opt = TAILQ_FIRST(opts);
 		vfs_freeopt(opts, opt);
 	}
 	free(opts, M_MOUNT);
 }
 
 void
 vfs_deleteopt(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt, *temp;
 
 	if (opts == NULL)
 		return;
 	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
 		if (strcmp(opt->name, name) == 0)
 			vfs_freeopt(opts, opt);
 	}
 }
 
 static int
 vfs_isopt_ro(const char *opt)
 {
 
 	if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
 	    strcmp(opt, "norw") == 0)
 		return (1);
 	return (0);
 }
 
 static int
 vfs_isopt_rw(const char *opt)
 {
 
 	if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Check if options are equal (with or without the "no" prefix).
  */
 static int
 vfs_equalopts(const char *opt1, const char *opt2)
 {
 	char *p;
 
 	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
 	if (strcmp(opt1, opt2) == 0)
 		return (1);
 	/* "noopt" vs. "opt" */
 	if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 		return (1);
 	/* "opt" vs. "noopt" */
 	if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 		return (1);
 	while ((p = strchr(opt1, '.')) != NULL &&
 	    !strncmp(opt1, opt2, ++p - opt1)) {
 		opt2 += p - opt1;
 		opt1 = p;
 		/* "foo.noopt" vs. "foo.opt" */
 		if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
 			return (1);
 		/* "foo.opt" vs. "foo.noopt" */
 		if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
 			return (1);
 	}
 	/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
 	if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
 	    (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
 		return (1);
 	return (0);
 }
 
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
  * the last occurrence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt, *opt2, *tmp;
 
 	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
 		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
 		while (opt2 != NULL) {
 			if (vfs_equalopts(opt->name, opt2->name)) {
 				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
 				vfs_freeopt(opts, opt2);
 				opt2 = tmp;
 			} else {
 				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
 			}
 		}
 	}
 }
 
 /*
  * Build a linked list of mount options from a struct uio.
  */
 int
 vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
 {
 	struct vfsoptlist *opts;
 	struct vfsopt *opt;
 	size_t memused, namelen, optlen;
 	unsigned int i, iovcnt;
 	int error;
 
 	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 	TAILQ_INIT(opts);
 	memused = 0;
 	iovcnt = auio->uio_iovcnt;
 	for (i = 0; i < iovcnt; i += 2) {
 		namelen = auio->uio_iov[i].iov_len;
 		optlen = auio->uio_iov[i + 1].iov_len;
 		memused += sizeof(struct vfsopt) + optlen + namelen;
 		/*
 		 * Avoid consuming too much memory, and attempts to overflow
 		 * memused.
 		 */
 		if (memused > VFS_MOUNTARG_SIZE_MAX ||
 		    optlen > VFS_MOUNTARG_SIZE_MAX ||
 		    namelen > VFS_MOUNTARG_SIZE_MAX) {
 			error = EINVAL;
 			goto bad;
 		}
 
 		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
 		opt->value = NULL;
 		opt->len = 0;
 		opt->pos = i / 2;
 		opt->seen = 0;
 
 		/*
 		 * Do this early, so jumps to "bad" will free the current
 		 * option.
 		 */
 		TAILQ_INSERT_TAIL(opts, opt, link);
 
 		if (auio->uio_segflg == UIO_SYSSPACE) {
 			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
 		} else {
 			error = copyin(auio->uio_iov[i].iov_base, opt->name,
 			    namelen);
 			if (error)
 				goto bad;
 		}
 		/* Ensure names are null-terminated strings. */
 		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
 			error = EINVAL;
 			goto bad;
 		}
 		if (optlen != 0) {
 			opt->len = optlen;
 			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
 			if (auio->uio_segflg == UIO_SYSSPACE) {
 				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
 				    optlen);
 			} else {
 				error = copyin(auio->uio_iov[i + 1].iov_base,
 				    opt->value, optlen);
 				if (error)
 					goto bad;
 			}
 		}
 	}
 	vfs_sanitizeopts(opts);
 	*options = opts;
 	return (0);
 bad:
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Merge the old mount options with the new ones passed
  * in the MNT_UPDATE case.
  *
  * XXX: This function will keep a "nofoo" option in the new
  * options.  E.g, if the option's canonical name is "foo",
  * "nofoo" ends up in the mount point's active options.
  */
 static void
 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
 {
 	struct vfsopt *opt, *new;
 
 	TAILQ_FOREACH(opt, oldopts, link) {
 		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
 		new->name = strdup(opt->name, M_MOUNT);
 		if (opt->len != 0) {
 			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
 			bcopy(opt->value, new->value, opt->len);
 		} else
 			new->value = NULL;
 		new->len = opt->len;
 		new->seen = opt->seen;
 		TAILQ_INSERT_HEAD(toopts, new, link);
 	}
 	vfs_sanitizeopts(toopts);
 }
 
 /*
  * Mount a filesystem.
  */
 int
 sys_nmount(td, uap)
 	struct thread *td;
 	struct nmount_args /* {
 		struct iovec *iovp;
 		unsigned int iovcnt;
 		int flags;
 	} */ *uap;
 {
 	struct uio *auio;
 	int error;
 	u_int iovcnt;
 	uint64_t flags;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit archtectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
 	    uap->iovp, uap->iovcnt, flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	iovcnt = uap->iovcnt;
 	/*
 	 * Check that we have an even number of iovec's
 	 * and that we have at least two options.
 	 */
 	if ((iovcnt & 1) || (iovcnt < 4)) {
 		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
 		    uap->iovcnt);
 		return (EINVAL);
 	}
 
 	error = copyinuio(uap->iovp, iovcnt, &auio);
 	if (error) {
 		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
 		    __func__, error);
 		return (error);
 	}
 	error = vfs_donmount(td, flags, auio);
 
 	free(auio, M_IOV);
 	return (error);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Various utility functions
  */
 
 void
 vfs_ref(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 }
 
 void
 vfs_rel(struct mount *mp)
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Allocate and initialize the mount point struct.
  */
 struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
     struct ucred *cred)
 {
 	struct mount *mp;
 
 	mp = uma_zalloc(mount_zone, M_WAITOK);
 	bzero(&mp->mnt_startzero,
 	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_activevnodelist);
 	mp->mnt_activevnodelistsize = 0;
 	mp->mnt_ref = 0;
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	mp->mnt_cred = crdup(cred);
 	mp->mnt_stat.f_owner = cred->cr_uid;
 	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
 	mp->mnt_iosize_max = DFLTPHYS;
 #ifdef MAC
 	mac_mount_init(mp);
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	TAILQ_INIT(&mp->mnt_uppers);
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
 void
 vfs_mount_destroy(struct mount *mp)
 {
 
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	while (mp->mnt_ref)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
 	KASSERT(mp->mnt_ref == 0,
 	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
 	    __FILE__, __LINE__));
 	if (mp->mnt_writeopcount != 0)
 		panic("vfs_mount_destroy: nonzero writeopcount");
 	if (mp->mnt_secondary_writes != 0)
 		panic("vfs_mount_destroy: nonzero secondary_writes");
 	atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
 			vn_printf(vp, "dangling vnode ");
 		panic("unmount: dangling vnode");
 	}
 	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
 	if (mp->mnt_activevnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero activevnodelistsize");
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
 #ifdef MAC
 	mac_mount_destroy(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	crfree(mp->mnt_cred);
 	uma_zfree(mount_zone, mp);
 }
 
 int
 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
 	struct vfsopt *opt, *tmp_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
 
 	errmsg = fspath = NULL;
 	errmsg_len = fspathlen = 0;
 	errmsg_pos = -1;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
 	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
 		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
 
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
 	 * Ensure they are NUL terminated as well.
 	 */
 	fstypelen = 0;
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fstype", errmsg_len);
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
 		if (errmsg != NULL)
 			strncpy(errmsg, "Invalid fspath", errmsg_len);
 		goto bail;
 	}
 
 	/*
 	 * We need to see if we have the "update" option
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
 	 */
 	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
 		if (strcmp(opt->name, "update") == 0) {
 			fsflags |= MNT_UPDATE;
 			vfs_freeopt(optlist, opt);
 		}
 		else if (strcmp(opt->name, "async") == 0)
 			fsflags |= MNT_ASYNC;
 		else if (strcmp(opt->name, "force") == 0) {
 			fsflags |= MNT_FORCE;
 			vfs_freeopt(optlist, opt);
 		}
 		else if (strcmp(opt->name, "reload") == 0) {
 			fsflags |= MNT_RELOAD;
 			vfs_freeopt(optlist, opt);
 		}
 		else if (strcmp(opt->name, "multilabel") == 0)
 			fsflags |= MNT_MULTILABEL;
 		else if (strcmp(opt->name, "noasync") == 0)
 			fsflags &= ~MNT_ASYNC;
 		else if (strcmp(opt->name, "noatime") == 0)
 			fsflags |= MNT_NOATIME;
 		else if (strcmp(opt->name, "atime") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoatime", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterr") == 0)
 			fsflags |= MNT_NOCLUSTERR;
 		else if (strcmp(opt->name, "clusterr") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterr", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noclusterw") == 0)
 			fsflags |= MNT_NOCLUSTERW;
 		else if (strcmp(opt->name, "clusterw") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoclusterw", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noexec") == 0)
 			fsflags |= MNT_NOEXEC;
 		else if (strcmp(opt->name, "exec") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonoexec", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosuid") == 0)
 			fsflags |= MNT_NOSUID;
 		else if (strcmp(opt->name, "suid") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosuid", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "symfollow") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosymfollow", M_MOUNT);
 		}
 		else if (strcmp(opt->name, "noro") == 0)
 			fsflags &= ~MNT_RDONLY;
 		else if (strcmp(opt->name, "rw") == 0)
 			fsflags &= ~MNT_RDONLY;
 		else if (strcmp(opt->name, "ro") == 0)
 			fsflags |= MNT_RDONLY;
 		else if (strcmp(opt->name, "rdonly") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("ro", M_MOUNT);
 			fsflags |= MNT_RDONLY;
 		}
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
 		else if (strcmp(opt->name, "automounted") == 0) {
 			fsflags |= MNT_AUTOMOUNTED;
 			vfs_freeopt(optlist, opt);
 		}
 	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
 bail:
 	/* copyout the errmsg */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
 	    && errmsg_len > 0 && errmsg != NULL) {
 		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
 			bcopy(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		} else {
 			copyout(errmsg,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
 			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
 		}
 	}
 
 	if (optlist != NULL)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
  * Old mount API.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mount_args {
 	char	*type;
 	char	*path;
 	int	flags;
 	caddr_t	data;
 };
 #endif
 /* ARGSUSED */
 int
 sys_mount(td, uap)
 	struct thread *td;
 	struct mount_args /* {
 		char *type;
 		char *path;
 		int flags;
 		caddr_t data;
 	} */ *uap;
 {
 	char *fstype;
 	struct vfsconf *vfsp = NULL;
 	struct mntarg *ma = NULL;
 	uint64_t flags;
 	int error;
 
 	/*
 	 * Mount flags are now 64-bits. On 32-bit architectures only
 	 * 32-bits are passed in, but from here on everything handles
 	 * 64-bit flags correctly.
 	 */
 	flags = uap->flags;
 
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
 	 * userspace to set this flag, but we must filter it out if we want
 	 * MNT_UPDATE on the root file system to work.
 	 * MNT_ROOTFS should only be set by the kernel when mounting its
 	 * root file system.
 	 */
 	flags &= ~MNT_ROOTFS;
 
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
 	if (error) {
 		free(fstype, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_TEXT(fstype);
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
 	if (vfsp == NULL)
 		return (ENOENT);
 	if (vfsp->vfc_vfsops->vfs_cmount == NULL)
 		return (EOPNOTSUPP);
 
 	ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
 	ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
 
 	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
 	return (error);
 }
 
 /*
  * vfs_domount_first(): first file system mount (not update)
  */
 static int
 vfs_domount_first(
 	struct thread *td,		/* Calling thread. */
 	struct vfsconf *vfsp,		/* File system type. */
 	char *fspath,			/* Mount path. */
 	struct vnode *vp,		/* Vnode to be covered. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vattr va;
 	struct mount *mp;
 	struct vnode *newdp;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
 
 	/*
 	 * If the user is not root, ensure that they own the directory
 	 * onto which we are attempting to mount.
 	 */
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
 	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
 		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
 	if (error == 0)
 		error = vinvalbuf(vp, V_SAVE, 0, 0);
 	if (error == 0 && vp->v_type != VDIR)
 		error = ENOTDIR;
 	if (error == 0) {
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
 			vp->v_iflag |= VI_MOUNT;
 		else
 			error = EBUSY;
 		VI_UNLOCK(vp);
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	VOP_UNLOCK(vp, 0);
 
 	/* Allocate and initialize the filesystem. */
 	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
 	/* XXXMAC: pass to vfs_mount_alloc? */
 	mp->mnt_optnew = *optlist;
 	/* Set the mount level flags. */
 	mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error = VFS_MOUNT(mp);
 	if (error != 0) {
 		vfs_unbusy(mp);
 		vfs_mount_destroy(mp);
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
 		vrele(vp);
 		return (error);
 	}
 
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 	(void)VFS_STATFS(mp, &mp->mnt_stat);
 
 	/*
 	 * Prevent external consumers of mount options from reading mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	cache_purge(vp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	vp->v_mountedhere = mp;
 	/* Place the new filesystem at the end of the mount list. */
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	vfs_event_signal(NULL, VQ_MOUNT, 0);
 	if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
 		panic("mount: lost mount");
 	VOP_UNLOCK(vp, 0);
 	EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td);
 	VOP_UNLOCK(newdp, 0);
 	mountcheckdirs(vp, newdp);
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	vfs_unbusy(mp);
 	return (0);
 }
 
 /*
  * vfs_domount_update(): update of mounted file system
  */
 static int
 vfs_domount_update(
 	struct thread *td,		/* Calling thread. */
 	struct vnode *vp,		/* Mount point vnode. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct export_args export;
 	void *bufp;
 	struct mount *mp;
 	int error, export_error, len;
 	uint64_t flag;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
 	mp = vp->v_mount;
 
 	if ((vp->v_vflag & VV_ROOT) == 0) {
 		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
 		    == 0)
 			error = EXDEV;
 		else
 			error = EINVAL;
 		vput(vp);
 		return (error);
 	}
 
 	/*
 	 * We only allow the filesystem to be reloaded if it
 	 * is currently mounted read-only.
 	 */
 	flag = mp->mnt_flag;
 	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
 		vput(vp);
 		return (EOPNOTSUPP);	/* Needs translation */
 	}
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that
 	 * did the original mount is permitted to update it.
 	 */
 	error = vfs_suser(mp, td);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	if (vfs_busy(mp, MBF_NOWAIT)) {
 		vput(vp);
 		return (EBUSY);
 	}
 	VI_LOCK(vp);
 	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
 		VI_UNLOCK(vp);
 		vfs_unbusy(mp);
 		vput(vp);
 		return (EBUSY);
 	}
 	vp->v_iflag |= VI_MOUNT;
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp, 0);
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
 		error = EBUSY;
 		goto end;
 	}
 	mp->mnt_flag &= ~MNT_UPDATEMASK;
 	mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
 	    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
 	if ((mp->mnt_flag & MNT_ASYNC) == 0)
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 	mp->mnt_optnew = *optlist;
 	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
 
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
 	error = VFS_MOUNT(mp);
 
 	export_error = 0;
 	/* Process the export option. */
 	if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
 	    &len) == 0) {
 		/* Assume that there is only 1 ABI for each length. */
 		switch (len) {
 		case (sizeof(struct oexport_args)):
 			bzero(&export, sizeof(export));
 			/* FALLTHROUGH */
 		case (sizeof(export)):
 			bcopy(bufp, &export, len);
 			export_error = vfs_export(mp, &export);
 			break;
 		default:
 			export_error = EINVAL;
 			break;
 		}
 	}
 
 	MNT_ILOCK(mp);
 	if (error == 0) {
 		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
 		    MNT_SNAPSHOT);
 	} else {
 		/*
 		 * If we fail, restore old mount flags. MNT_QUOTA is special,
 		 * because it is not part of MNT_UPDATEMASK, but it could have
 		 * changed in the meantime if quotactl(2) was called.
 		 * All in all we want current value of MNT_QUOTA, not the old
 		 * one.
 		 */
 		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
 	}
 	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 		mp->mnt_kern_flag |= MNTK_ASYNC;
 	else
 		mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
 
 	if (error != 0)
 		goto end;
 
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	mp->mnt_opt = mp->mnt_optnew;
 	*optlist = NULL;
 	(void)VFS_STATFS(mp, &mp->mnt_stat);
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
 	 */
 	mp->mnt_optnew = NULL;
 
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
 	else
 		vfs_deallocate_syncvnode(mp);
 end:
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vp);
 	vrele(vp);
 	return (error != 0 ? error : export_error);
 }
 
 /*
  * vfs_domount(): actually attempt a filesystem mount.
  */
 static int
 vfs_domount(
 	struct thread *td,		/* Calling thread. */
 	const char *fstype,		/* Filesystem type. */
 	char *fspath,			/* Mount path. */
 	uint64_t fsflags,		/* Flags common to all filesystems. */
 	struct vfsoptlist **optlist	/* Options local to the filesystem. */
 	)
 {
 	struct vfsconf *vfsp;
 	struct nameidata nd;
 	struct vnode *vp;
 	char *pathbuf;
 	int error;
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
 	if (jailed(td->td_ucred) || usermount == 0) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
 	if (fsflags & MNT_EXPORTED) {
 		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
 		if (error)
 			return (error);
 	}
 	if (fsflags & MNT_SUIDDIR) {
 		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
 		if (error)
 			return (error);
 	}
 	/*
 	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
 	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
 		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
 			fsflags |= MNT_NOSUID | MNT_USER;
 	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	vfsp = NULL;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		/* Don't try to load KLDs if we're mounting the root. */
 		if (fsflags & MNT_ROOTFS)
 			vfsp = vfs_byname(fstype);
 		else
 			vfsp = vfs_byname_kld(fstype, td, &error);
 		if (vfsp == NULL)
 			return (ENODEV);
 		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
 			return (EPERM);
 	}
 
 	/*
 	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, fspath, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if ((fsflags & MNT_UPDATE) == 0) {
 		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 		strcpy(pathbuf, fspath);
 		error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN);
 		/* debug.disablefullpath == 1 results in ENODEV */
 		if (error == 0 || error == ENODEV) {
 			error = vfs_domount_first(td, vfsp, pathbuf, vp,
 			    fsflags, optlist);
 		}
 		free(pathbuf, M_TEMP);
 	} else
 		error = vfs_domount_update(td, vp, fsflags, optlist);
 
 	return (error);
 }
 
 /*
  * Unmount a filesystem.
  *
  * Note: unmount takes a path to the vnode mounted on as argument, not
  * special file (as before).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
 	char	*path;
 	int	flags;
 };
 #endif
 /* ARGSUSED */
 int
 sys_unmount(struct thread *td, struct unmount_args *uap)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	char *pathbuf;
 	int error, id0, id1;
 
 	AUDIT_ARG_VALUE(uap->flags);
 	if (jailed(td->td_ucred) || usermount == 0) {
 		error = priv_check(td, PRIV_VFS_UNMOUNT);
 		if (error)
 			return (error);
 	}
 
 	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
 	if (error) {
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
 	if (uap->flags & MNT_BYFSID) {
 		AUDIT_ARG_TEXT(pathbuf);
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
 			    mp->mnt_stat.f_fsid.val[1] == id1) {
 				vfs_ref(mp);
 				break;
 			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	} else {
 		/*
 		 * Try to find global path for path argument.
 		 */
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 		    UIO_SYSSPACE, pathbuf, td);
 		if (namei(&nd) == 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
 			    MNAMELEN);
 			if (error == 0 || error == ENODEV)
 				vput(nd.ni_vp);
 		}
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
 				vfs_ref(mp);
 				break;
 			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	}
 	free(pathbuf, M_TEMP);
 	if (mp == NULL) {
 		/*
 		 * Previously we returned ENOENT for a nonexistent path and
 		 * EINVAL for a non-mountpoint.  We cannot tell these apart
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
 		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
 	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
 		vfs_rel(mp);
 		return (EINVAL);
 	}
 	error = dounmount(mp, uap->flags, td);
 	return (error);
 }
 
 /*
  * Return error if any of the vnodes, ignoring the root vnode
  * and the syncer vnode, have non-zero usecount.
  *
  * This function is purely advisory - it can return false positives
  * and negatives.
  */
 static int
 vfs_check_usecounts(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
 
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
 		    vp->v_usecount != 0) {
 			VI_UNLOCK(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			return (EBUSY);
 		}
 		VI_UNLOCK(vp);
 	}
 
 	return (0);
 }
 
 static void
 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
 {
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 	mp->mnt_kern_flag &= ~mntkflags;
 	if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
 	MNT_IUNLOCK(mp);
 	if (coveredvp != NULL) {
 		VOP_UNLOCK(coveredvp, 0);
 		vdrop(coveredvp);
 	}
 	vn_finished_write(mp);
 }
 
 /*
  * Do the actual filesystem unmount.
  */
 int
 dounmount(struct mount *mp, int flags, struct thread *td)
 {
 	struct vnode *coveredvp, *fsrootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
 
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
 		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
 		 */
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp, 0);
 			vdrop(coveredvp);
 			vfs_rel(mp);
 			return (EBUSY);
 		}
 	}
 
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 */
 	error = vfs_suser(mp, td);
 	if (error != 0) {
 		if (coveredvp != NULL) {
 			VOP_UNLOCK(coveredvp, 0);
 			vdrop(coveredvp);
 		}
 		vfs_rel(mp);
 		return (error);
 	}
 
 	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
 	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
 	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
 		dounmount_cleanup(mp, coveredvp, 0);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
 	if (flags & MNT_NONBUSY) {
 		MNT_IUNLOCK(mp);
 		error = vfs_check_usecounts(mp);
 		MNT_ILOCK(mp);
 		if (error != 0) {
 			dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT |
 			    MNTK_NOINSMNTQ);
 			return (error);
 		}
 	}
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE) {
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
 		MNT_IUNLOCK(mp);
 		/*
 		 * Must be done after setting MNTK_UNMOUNTF and before
 		 * waiting for mnt_lockref to become 0.
 		 */
 		VFS_PURGE(mp);
 		MNT_ILOCK(mp);
 	}
 	error = 0;
 	if (mp->mnt_lockref) {
 		mp->mnt_kern_flag |= MNTK_DRAINING;
 		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
 		    "mount drain", 0);
 	}
 	MNT_IUNLOCK(mp);
 	KASSERT(mp->mnt_lockref == 0,
 	    ("%s: invalid lock refcount in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 	KASSERT(error == 0,
 	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
 	    __func__, __FILE__, __LINE__));
 
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	/*
 	 * From now, we can claim that the use reference on the
 	 * coveredvp is ours, and the ref can be released only by
 	 * successfull unmount by us, or left for later unmount
 	 * attempt.  The previously acquired hold reference is no
 	 * longer needed to protect the vnode from reuse.
 	 */
 	if (coveredvp != NULL)
 		vdrop(coveredvp);
 
 	vfs_msync(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_kern_flag &= ~MNTK_ASYNC;
 	MNT_IUNLOCK(mp);
-	cache_purgevfs(mp);	/* remove cache entries for this file sys */
+	cache_purgevfs(mp, false); /* remove cache entries for this file sys */
 	vfs_deallocate_syncvnode(mp);
 	/*
 	 * For forced unmounts, move process cdir/rdir refs on the fs root
 	 * vnode to the covered vnode.  For non-forced unmounts we want
 	 * such references to cause an EBUSY error.
 	 */
 	if ((flags & MNT_FORCE) &&
 	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
 		if (mp->mnt_vnodecovered != NULL &&
 		    (mp->mnt_flag & MNT_IGNORE) == 0)
 			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
 		if (fsrootvp == rootvnode) {
 			vrele(rootvnode);
 			rootvnode = NULL;
 		}
 		vput(fsrootvp);
 	}
 	if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 ||
 	    (error = VFS_SYNC(mp, MNT_WAIT)) == 0)
 		error = VFS_UNMOUNT(mp, flags);
 	vn_finished_write(mp);
 	/*
 	 * If we failed to flush the dirty blocks for this mount point,
 	 * undo all the cdir/rdir and rootvnode changes we made above.
 	 * Unless we failed to do so because the device is reporting that
 	 * it doesn't exist anymore.
 	 */
 	if (error && error != ENXIO) {
 		if ((flags & MNT_FORCE) &&
 		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
 			if (mp->mnt_vnodecovered != NULL &&
 			    (mp->mnt_flag & MNT_IGNORE) == 0)
 				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
 			if (rootvnode == NULL) {
 				rootvnode = fsrootvp;
 				vref(rootvnode);
 			}
 			vput(fsrootvp);
 		}
 		MNT_ILOCK(mp);
 		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			MNT_IUNLOCK(mp);
 			vfs_allocate_syncvnode(mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
 		if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
 		    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
 			mp->mnt_kern_flag |= MNTK_ASYNC;
 		if (mp->mnt_kern_flag & MNTK_MWAIT) {
 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
 			wakeup(mp);
 		}
 		MNT_IUNLOCK(mp);
 		if (coveredvp)
 			VOP_UNLOCK(coveredvp, 0);
 		return (error);
 	}
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 	EVENTHANDLER_INVOKE(vfs_unmounted, mp, td);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
 		vput(coveredvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
 	if (mp == rootdevmp)
 		rootdevmp = NULL;
 	vfs_mount_destroy(mp);
 	return (0);
 }
 
 /*
  * Report errors during filesystem mounting.
  */
 void
 vfs_mount_error(struct mount *mp, const char *fmt, ...)
 {
 	struct vfsoptlist *moptlist = mp->mnt_optnew;
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 void
 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
 {
 	va_list ap;
 	int error, len;
 	char *errmsg;
 
 	error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
 	if (error || errmsg == NULL || len <= 0)
 		return;
 
 	va_start(ap, fmt);
 	vsnprintf(errmsg, (size_t)len, fmt, ap);
 	va_end(ap);
 }
 
 /*
  * ---------------------------------------------------------------------
  * Functions for querying mount options/arguments from filesystems.
  */
 
 /*
  * Check that no unknown options are given
  */
 int
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
 	char errmsg[255];
 	const char **t, *p, *q;
 	int ret = 0;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
 		q = NULL;
 		if (p[0] == 'n' && p[1] == 'o')
 			q = p + 2;
 		for(t = global_opts; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		for(t = legal; *t != NULL; t++) {
 			if (strcmp(*t, p) == 0)
 				break;
 			if (q != NULL) {
 				if (strcmp(*t, q) == 0)
 					break;
 			}
 		}
 		if (*t != NULL)
 			continue;
 		snprintf(errmsg, sizeof(errmsg),
 		    "mount option <%s> is unknown", p);
 		ret = EINVAL;
 	}
 	if (ret != 0) {
 		TAILQ_FOREACH(opt, opts, link) {
 			if (strcmp(opt->name, "errmsg") == 0) {
 				strncpy((char *)opt->value, errmsg, opt->len);
 				break;
 			}
 		}
 		if (opt == NULL)
 			printf("%s\n", errmsg);
 	}
 	return (ret);
 }
 
 /*
  * Get a mount option by its name.
  *
  * Return 0 if the option was found, ENOENT otherwise.
  * If len is non-NULL it will be filled with the length
  * of the option. If buf is non-NULL, it will be filled
  * with the address of the option.
  */
 int
 vfs_getopt(opts, name, buf, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void **buf;
 	int *len;
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (len != NULL)
 				*len = opt->len;
 			if (buf != NULL)
 				*buf = opt->value;
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 int
 vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
 {
 	struct vfsopt *opt;
 
 	if (opts == NULL)
 		return (-1);
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			return (opt->pos);
 		}
 	}
 	return (-1);
 }
 
 int
 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
 {
 	char *opt_value, *vtp;
 	quad_t iv;
 	int error, opt_len;
 
 	error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
 	if (error != 0)
 		return (error);
 	if (opt_len == 0 || opt_value == NULL)
 		return (EINVAL);
 	if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
 		return (EINVAL);
 	iv = strtoq(opt_value, &vtp, 0);
 	if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
 		return (EINVAL);
 	if (iv < 0)
 		return (EINVAL);
 	switch (vtp[0]) {
 	case 't':
 	case 'T':
 		iv *= 1024;
 	case 'g':
 	case 'G':
 		iv *= 1024;
 	case 'm':
 	case 'M':
 		iv *= 1024;
 	case 'k':
 	case 'K':
 		iv *= 1024;
 	case '\0':
 		break;
 	default:
 		return (EINVAL);
 	}
 	*value = iv;
 
 	return (0);
 }
 
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
 	struct vfsopt *opt;
 
 	*error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 ||
 		    ((char *)opt->value)[opt->len - 1] != '\0') {
 			*error = EINVAL;
 			return (NULL);
 		}
 		return (opt->value);
 	}
 	*error = ENOENT;
 	return (NULL);
 }
 
 int
 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
 	uint64_t val)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (w != NULL)
 				*w |= val;
 			return (1);
 		}
 	}
 	if (w != NULL)
 		*w &= ~val;
 	return (0);
 }
 
 int
 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct vfsopt *opt;
 	int ret;
 
 	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->len == 0 || opt->value == NULL)
 			return (0);
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
 		ret = vsscanf(opt->value, fmt, ap);
 		va_end(ap);
 		return (ret);
 	}
 	return (0);
 }
 
 int
 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len != len)
 				return (EINVAL);
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = len;
 		else {
 			if (opt->len < len)
 				return (EINVAL);
 			opt->len = len;
 			bcopy(value, opt->value, len);
 		}
 		return (0);
 	}
 	return (ENOENT);
 }
 
 int
 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
 {
 	struct vfsopt *opt;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
 		opt->seen = 1;
 		if (opt->value == NULL)
 			opt->len = strlen(value) + 1;
 		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
 			return (EINVAL);
 		return (0);
 	}
 	return (ENOENT);
 }
 
 /*
  * Find and copy a mount option.
  *
  * The size of the buffer has to be specified
  * in len, if it is not the same length as the
  * mount option, EINVAL is returned.
  * Returns ENOENT if the option is not found.
  */
 int
 vfs_copyopt(opts, name, dest, len)
 	struct vfsoptlist *opts;
 	const char *name;
 	void *dest;
 	int len;
 {
 	struct vfsopt *opt;
 
 	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
 
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) == 0) {
 			opt->seen = 1;
 			if (len != opt->len)
 				return (EINVAL);
 			bcopy(opt->value, dest, opt->len);
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	int error;
 
 	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
 	if (sbp != &mp->mnt_stat)
 		*sbp = mp->mnt_stat;
 	return (error);
 }
 
 void
 vfs_mountedfrom(struct mount *mp, const char *from)
 {
 
 	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
 	strlcpy(mp->mnt_stat.f_mntfromname, from,
 	    sizeof mp->mnt_stat.f_mntfromname);
 }
 
 /*
  * ---------------------------------------------------------------------
  * This is the api for building mount args and mounting filesystems from
  * inside the kernel.
  *
  * The API works by accumulation of individual args.  First error is
  * latched.
  *
  * XXX: should be documented in new manpage kernel_mount(9)
  */
 
 /* A memory allocation which must be freed when we are done */
 struct mntaarg {
 	SLIST_ENTRY(mntaarg)	next;
 };
 
 /* The header for the mount arguments */
 struct mntarg {
 	struct iovec *v;
 	int len;
 	int error;
 	SLIST_HEAD(, mntaarg)	list;
 };
 
 /*
  * Add a boolean argument.
  *
  * flag is the boolean value.
  * name must start with "no".
  */
 struct mntarg *
 mount_argb(struct mntarg *ma, int flag, const char *name)
 {
 
 	KASSERT(name[0] == 'n' && name[1] == 'o',
 	    ("mount_argb(...,%s): name must start with 'no'", name));
 
 	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
 }
 
 /*
  * Add an argument printf style
  */
 struct mntarg *
 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
 {
 	va_list ap;
 	struct mntaarg *maa;
 	struct sbuf *sb;
 	int len;
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	sb = sbuf_new_auto();
 	va_start(ap, fmt);
 	sbuf_vprintf(sb, fmt, ap);
 	va_end(ap);
 	sbuf_finish(sb);
 	len = sbuf_len(sb) + 1;
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	bcopy(sbuf_data(sb), maa + 1, len);
 	sbuf_delete(sb);
 
 	ma->v[ma->len].iov_base = maa + 1;
 	ma->v[ma->len].iov_len = len;
 	ma->len++;
 
 	return (ma);
 }
 
 /*
  * Add an argument which is a userland string.
  */
 struct mntarg *
 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
 {
 	struct mntaarg *maa;
 	char *tbuf;
 
 	if (val == NULL)
 		return (ma);
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
 	SLIST_INSERT_HEAD(&ma->list, maa, next);
 	tbuf = (void *)(maa + 1);
 	ma->error = copyinstr(val, tbuf, len, NULL);
 	return (mount_arg(ma, name, tbuf, -1));
 }
 
 /*
  * Plain argument.
  *
  * If length is -1, treat value as a C string.
  */
 struct mntarg *
 mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
 {
 
 	if (ma == NULL) {
 		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
 		SLIST_INIT(&ma->list);
 	}
 	if (ma->error)
 		return (ma);
 
 	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
 	    M_MOUNT, M_WAITOK);
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
 	ma->v[ma->len].iov_len = strlen(name) + 1;
 	ma->len++;
 
 	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
 	if (len < 0)
 		ma->v[ma->len].iov_len = strlen(val) + 1;
 	else
 		ma->v[ma->len].iov_len = len;
 	ma->len++;
 	return (ma);
 }
 
 /*
  * Free a mntarg structure
  */
 static void
 free_mntarg(struct mntarg *ma)
 {
 	struct mntaarg *maa;
 
 	while (!SLIST_EMPTY(&ma->list)) {
 		maa = SLIST_FIRST(&ma->list);
 		SLIST_REMOVE_HEAD(&ma->list, next);
 		free(maa, M_MOUNT);
 	}
 	free(ma->v, M_MOUNT);
 	free(ma, M_MOUNT);
 }
 
 /*
  * Mount a filesystem
  */
 int
 kernel_mount(struct mntarg *ma, uint64_t flags)
 {
 	struct uio auio;
 	int error;
 
 	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
 	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
 	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
 
 	auio.uio_iov = ma->v;
 	auio.uio_iovcnt = ma->len;
 	auio.uio_segflg = UIO_SYSSPACE;
 
 	error = ma->error;
 	if (!error)
 		error = vfs_donmount(curthread, flags, &auio);
 	free_mntarg(ma);
 	return (error);
 }
 
 /*
  * A printflike function to mount a filesystem.
  */
 int
 kernel_vmount(int flags, ...)
 {
 	struct mntarg *ma = NULL;
 	va_list ap;
 	const char *cp;
 	const void *vp;
 	int error;
 
 	va_start(ap, flags);
 	for (;;) {
 		cp = va_arg(ap, const char *);
 		if (cp == NULL)
 			break;
 		vp = va_arg(ap, const void *);
 		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
 	}
 	va_end(ap);
 
 	error = kernel_mount(ma, flags);
 	return (error);
 }
 
 void
 vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
 {
 
 	bcopy(oexp, exp, sizeof(*oexp));
 	exp->ex_numsecflavors = 0;
 }
Index: stable/11/sys/kern/vfs_mountroot.c
===================================================================
--- stable/11/sys/kern/vfs_mountroot.c	(revision 310958)
+++ stable/11/sys/kern/vfs_mountroot.c	(revision 310959)
@@ -1,1105 +1,1105 @@
 /*-
  * Copyright (c) 2010 Marcel Moolenaar
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
  * Copyright (c) 1989, 1993
  *      The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_rootdevname.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/cons.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mdioctl.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 
 /*
  * The root filesystem is detailed in the kernel environment variable
  * vfs.root.mountfrom, which is expected to be in the general format
  *
  * <vfsname>:[<path>][	<vfsname>:[<path>] ...]
  * vfsname   := the name of a VFS known to the kernel and capable
  *              of being mounted as root
  * path      := disk device name or other data used by the filesystem
  *              to locate its physical store
  *
  * If the environment variable vfs.root.mountfrom is a space separated list,
  * each list element is tried in turn and the root filesystem will be mounted
  * from the first one that succeeds.
  *
  * The environment variable vfs.root.mountfrom.options is a comma delimited
  * set of string mount options.  These mount options must be parseable
  * by nmount() in the kernel.
  */
 
 static int parse_mount(char **);
 static struct mntarg *parse_mountroot_options(struct mntarg *, const char *);
 static int sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS);
 static void vfs_mountroot_wait(void);
 static int vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev);
 
 /*
  * The vnode of the system's root (/ in the filesystem, without chroot
  * active.)
  */
 struct vnode *rootvnode;
 
 /*
  * Mount of the system's /dev.
  */
 struct mount *rootdevmp;
 
 char *rootdevnames[2] = {NULL, NULL};
 
 struct mtx root_holds_mtx;
 MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
 
 struct root_hold_token {
 	const char			*who;
 	LIST_ENTRY(root_hold_token)	list;
 };
 
 static LIST_HEAD(, root_hold_token)	root_holds =
     LIST_HEAD_INITIALIZER(root_holds);
 
 enum action {
 	A_CONTINUE,
 	A_PANIC,
 	A_REBOOT,
 	A_RETRY
 };
 
 static enum action root_mount_onfail = A_CONTINUE;
 
 static int root_mount_mddev;
 static int root_mount_complete;
 
 /* By default wait up to 3 seconds for devices to appear. */
 static int root_mount_timeout = 3;
 TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
 
 SYSCTL_PROC(_vfs, OID_AUTO, root_mount_hold,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_vfs_root_mount_hold, "A",
     "List of root mount hold tokens");
 
 static int
 sysctl_vfs_root_mount_hold(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct root_hold_token *h;
 	int error;
 
 	sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL);
 
 	mtx_lock(&root_holds_mtx);
 	LIST_FOREACH(h, &root_holds, list) {
 		if (h != LIST_FIRST(&root_holds))
 			sbuf_putc(&sb, ' ');
 		sbuf_printf(&sb, "%s", h->who);
 	}
 	mtx_unlock(&root_holds_mtx);
 
 	error = sbuf_finish(&sb);
 	if (error == 0)
 		error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb));
 	sbuf_delete(&sb);
 	return (error);
 }
 
 struct root_hold_token *
 root_mount_hold(const char *identifier)
 {
 	struct root_hold_token *h;
 
 	if (root_mounted())
 		return (NULL);
 
 	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
 	h->who = identifier;
 	mtx_lock(&root_holds_mtx);
 	LIST_INSERT_HEAD(&root_holds, h, list);
 	mtx_unlock(&root_holds_mtx);
 	return (h);
 }
 
 void
 root_mount_rel(struct root_hold_token *h)
 {
 
 	if (h == NULL)
 		return;
 	mtx_lock(&root_holds_mtx);
 	LIST_REMOVE(h, list);
 	wakeup(&root_holds);
 	mtx_unlock(&root_holds_mtx);
 	free(h, M_DEVBUF);
 }
 
 int
 root_mounted(void)
 {
 
 	/* No mutex is acquired here because int stores are atomic. */
 	return (root_mount_complete);
 }
 
 static void
 set_rootvnode(void)
 {
 	struct proc *p;
 
 	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode))
 		panic("Cannot find root vnode");
 
 	VOP_UNLOCK(rootvnode, 0);
 
 	p = curthread->td_proc;
 	FILEDESC_XLOCK(p->p_fd);
 
 	if (p->p_fd->fd_cdir != NULL)
 		vrele(p->p_fd->fd_cdir);
 	p->p_fd->fd_cdir = rootvnode;
 	VREF(rootvnode);
 
 	if (p->p_fd->fd_rdir != NULL)
 		vrele(p->p_fd->fd_rdir);
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(rootvnode);
 
 	FILEDESC_XUNLOCK(p->p_fd);
 }
 
 static int
 vfs_mountroot_devfs(struct thread *td, struct mount **mpp)
 {
 	struct vfsoptlist *opts;
 	struct vfsconf *vfsp;
 	struct mount *mp;
 	int error;
 
 	*mpp = NULL;
 
 	if (rootdevmp != NULL) {
 		/*
 		 * Already have /dev; this happens during rerooting.
 		 */
 		error = vfs_busy(rootdevmp, 0);
 		if (error != 0)
 			return (error);
 		*mpp = rootdevmp;
 	} else {
 		vfsp = vfs_byname("devfs");
 		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
 		if (vfsp == NULL)
 			return (ENOENT);
 
 		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
 
 		error = VFS_MOUNT(mp);
 		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
 		if (error)
 			return (error);
 
 		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
 		TAILQ_INIT(opts);
 		mp->mnt_opt = opts;
 
 		mtx_lock(&mountlist_mtx);
 		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
 		mtx_unlock(&mountlist_mtx);
 
 		*mpp = mp;
 		rootdevmp = mp;
 	}
 
 	set_rootvnode();
 
 	error = kern_symlinkat(td, "/", AT_FDCWD, "dev", UIO_SYSSPACE);
 	if (error)
 		printf("kern_symlink /dev -> / returns %d\n", error);
 
 	return (error);
 }
 
 static void
 vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
 {
 	struct nameidata nd;
 	struct mount *mporoot, *mpnroot;
 	struct vnode *vp, *vporoot, *vpdevfs;
 	char *fspath;
 	int error;
 
 	mpnroot = TAILQ_NEXT(mpdevfs, mnt_list);
 
 	/* Shuffle the mountlist. */
 	mtx_lock(&mountlist_mtx);
 	mporoot = TAILQ_FIRST(&mountlist);
 	TAILQ_REMOVE(&mountlist, mpdevfs, mnt_list);
 	if (mporoot != mpdevfs) {
 		TAILQ_REMOVE(&mountlist, mpnroot, mnt_list);
 		TAILQ_INSERT_HEAD(&mountlist, mpnroot, mnt_list);
 	}
 	TAILQ_INSERT_TAIL(&mountlist, mpdevfs, mnt_list);
 	mtx_unlock(&mountlist_mtx);
 
-	cache_purgevfs(mporoot);
+	cache_purgevfs(mporoot, true);
 	if (mporoot != mpdevfs)
-		cache_purgevfs(mpdevfs);
+		cache_purgevfs(mpdevfs, true);
 
 	VFS_ROOT(mporoot, LK_EXCLUSIVE, &vporoot);
 
 	VI_LOCK(vporoot);
 	vporoot->v_iflag &= ~VI_MOUNT;
 	VI_UNLOCK(vporoot);
 	vporoot->v_mountedhere = NULL;
 	mporoot->mnt_flag &= ~MNT_ROOTFS;
 	mporoot->mnt_vnodecovered = NULL;
 	vput(vporoot);
 
 	/* Set up the new rootvnode, and purge the cache */
 	mpnroot->mnt_vnodecovered = NULL;
 	set_rootvnode();
-	cache_purgevfs(rootvnode->v_mount);
+	cache_purgevfs(rootvnode->v_mount, true);
 
 	if (mporoot != mpdevfs) {
 		/* Remount old root under /.mount or /mnt */
 		fspath = "/.mount";
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
 		    fspath, td);
 		error = namei(&nd);
 		if (error) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			fspath = "/mnt";
 			NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
 			    fspath, td);
 			error = namei(&nd);
 		}
 		if (!error) {
 			vp = nd.ni_vp;
 			error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
 			if (!error)
 				error = vinvalbuf(vp, V_SAVE, 0, 0);
 			if (!error) {
 				cache_purge(vp);
 				mporoot->mnt_vnodecovered = vp;
 				vp->v_mountedhere = mporoot;
 				strlcpy(mporoot->mnt_stat.f_mntonname,
 				    fspath, MNAMELEN);
 				VOP_UNLOCK(vp, 0);
 			} else
 				vput(vp);
 		}
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 
 		if (error)
 			printf("mountroot: unable to remount previous root "
 			    "under /.mount or /mnt (error %d)\n", error);
 	}
 
 	/* Remount devfs under /dev */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
 	error = namei(&nd);
 	if (!error) {
 		vp = nd.ni_vp;
 		error = (vp->v_type == VDIR) ? 0 : ENOTDIR;
 		if (!error)
 			error = vinvalbuf(vp, V_SAVE, 0, 0);
 		if (!error) {
 			vpdevfs = mpdevfs->mnt_vnodecovered;
 			if (vpdevfs != NULL) {
 				cache_purge(vpdevfs);
 				vpdevfs->v_mountedhere = NULL;
 				vrele(vpdevfs);
 			}
 			mpdevfs->mnt_vnodecovered = vp;
 			vp->v_mountedhere = mpdevfs;
 			VOP_UNLOCK(vp, 0);
 		} else
 			vput(vp);
 	}
 	if (error)
 		printf("mountroot: unable to remount devfs under /dev "
 		    "(error %d)\n", error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	if (mporoot == mpdevfs) {
 		vfs_unbusy(mpdevfs);
 		/* Unlink the no longer needed /dev/dev -> / symlink */
 		error = kern_unlinkat(td, AT_FDCWD, "/dev/dev",
 		    UIO_SYSSPACE, 0);
 		if (error)
 			printf("mountroot: unable to unlink /dev/dev "
 			    "(error %d)\n", error);
 	}
 }
 
 /*
  * Configuration parser.
  */
 
 /* Parser character classes. */
 #define	CC_WHITESPACE		-1
 #define	CC_NONWHITESPACE	-2
 
 /* Parse errors. */
 #define	PE_EOF			-1
 #define	PE_EOL			-2
 
 static __inline int
 parse_peek(char **conf)
 {
 
 	return (**conf);
 }
 
 static __inline void
 parse_poke(char **conf, int c)
 {
 
 	**conf = c;
 }
 
 static __inline void
 parse_advance(char **conf)
 {
 
 	(*conf)++;
 }
 
 static int
 parse_skipto(char **conf, int mc)
 {
 	int c, match;
 
 	while (1) {
 		c = parse_peek(conf);
 		if (c == 0)
 			return (PE_EOF);
 		switch (mc) {
 		case CC_WHITESPACE:
 			match = (c == ' ' || c == '\t' || c == '\n') ? 1 : 0;
 			break;
 		case CC_NONWHITESPACE:
 			if (c == '\n')
 				return (PE_EOL);
 			match = (c != ' ' && c != '\t') ? 1 : 0;
 			break;
 		default:
 			match = (c == mc) ? 1 : 0;
 			break;
 		}
 		if (match)
 			break;
 		parse_advance(conf);
 	}
 	return (0);
 }
 
 static int
 parse_token(char **conf, char **tok)
 {
 	char *p;
 	size_t len;
 	int error;
 
 	*tok = NULL;
 	error = parse_skipto(conf, CC_NONWHITESPACE);
 	if (error)
 		return (error);
 	p = *conf;
 	error = parse_skipto(conf, CC_WHITESPACE);
 	len = *conf - p;
 	*tok = malloc(len + 1, M_TEMP, M_WAITOK | M_ZERO);
 	bcopy(p, *tok, len);
 	return (0);
 }
 
 static void
 parse_dir_ask_printenv(const char *var)
 {
 	char *val;
 
 	val = kern_getenv(var);
 	if (val != NULL) {
 		printf("  %s=%s\n", var, val);
 		freeenv(val);
 	}
 }
 
 static int
 parse_dir_ask(char **conf)
 {
 	char name[80];
 	char *mnt;
 	int error;
 
 	vfs_mountroot_wait();
 
 	printf("\nLoader variables:\n");
 	parse_dir_ask_printenv("vfs.root.mountfrom");
 	parse_dir_ask_printenv("vfs.root.mountfrom.options");
 
 	printf("\nManual root filesystem specification:\n");
 	printf("  <fstype>:<device> [options]\n");
 	printf("      Mount <device> using filesystem <fstype>\n");
 	printf("      and with the specified (optional) option list.\n");
 	printf("\n");
 	printf("    eg. ufs:/dev/da0s1a\n");
 	printf("        zfs:tank\n");
 	printf("        cd9660:/dev/cd0 ro\n");
 	printf("          (which is equivalent to: ");
 	printf("mount -t cd9660 -o ro /dev/cd0 /)\n");
 	printf("\n");
 	printf("  ?               List valid disk boot devices\n");
 	printf("  .               Yield 1 second (for background tasks)\n");
 	printf("  <empty line>    Abort manual input\n");
 
 	do {
 		error = EINVAL;
 		printf("\nmountroot> ");
 		cngets(name, sizeof(name), GETS_ECHO);
 		if (name[0] == '\0')
 			break;
 		if (name[0] == '?' && name[1] == '\0') {
 			printf("\nList of GEOM managed disk devices:\n  ");
 			g_dev_print();
 			continue;
 		}
 		if (name[0] == '.' && name[1] == '\0') {
 			pause("rmask", hz);
 			continue;
 		}
 		mnt = name;
 		error = parse_mount(&mnt);
 		if (error == -1)
 			printf("Invalid file system specification.\n");
 	} while (error != 0);
 
 	return (error);
 }
 
 static int
 parse_dir_md(char **conf)
 {
 	struct stat sb;
 	struct thread *td;
 	struct md_ioctl *mdio;
 	char *path, *tok;
 	int error, fd, len;
 
 	td = curthread;
 
 	error = parse_token(conf, &tok);
 	if (error)
 		return (error);
 
 	len = strlen(tok);
 	mdio = malloc(sizeof(*mdio) + len + 1, M_TEMP, M_WAITOK | M_ZERO);
 	path = (void *)(mdio + 1);
 	bcopy(tok, path, len);
 	free(tok, M_TEMP);
 
 	/* Get file status. */
 	error = kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &sb, NULL);
 	if (error)
 		goto out;
 
 	/* Open /dev/mdctl so that we can attach/detach. */
 	error = kern_openat(td, AT_FDCWD, "/dev/" MDCTL_NAME, UIO_SYSSPACE,
 	    O_RDWR, 0);
 	if (error)
 		goto out;
 
 	fd = td->td_retval[0];
 	mdio->md_version = MDIOVERSION;
 	mdio->md_type = MD_VNODE;
 
 	if (root_mount_mddev != -1) {
 		mdio->md_unit = root_mount_mddev;
 		DROP_GIANT();
 		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
 		PICKUP_GIANT();
 		/* Ignore errors. We don't care. */
 		root_mount_mddev = -1;
 	}
 
 	mdio->md_file = (void *)(mdio + 1);
 	mdio->md_options = MD_AUTOUNIT | MD_READONLY;
 	mdio->md_mediasize = sb.st_size;
 	mdio->md_unit = 0;
 	DROP_GIANT();
 	error = kern_ioctl(td, fd, MDIOCATTACH, (void *)mdio);
 	PICKUP_GIANT();
 	if (error)
 		goto out;
 
 	if (mdio->md_unit > 9) {
 		printf("rootmount: too many md units\n");
 		mdio->md_file = NULL;
 		mdio->md_options = 0;
 		mdio->md_mediasize = 0;
 		DROP_GIANT();
 		error = kern_ioctl(td, fd, MDIOCDETACH, (void *)mdio);
 		PICKUP_GIANT();
 		/* Ignore errors. We don't care. */
 		error = ERANGE;
 		goto out;
 	}
 
 	root_mount_mddev = mdio->md_unit;
 	printf(MD_NAME "%u attached to %s\n", root_mount_mddev, mdio->md_file);
 
 	error = kern_close(td, fd);
 
  out:
 	free(mdio, M_TEMP);
 	return (error);
 }
 
 static int
 parse_dir_onfail(char **conf)
 {
 	char *action;
 	int error;
 
 	error = parse_token(conf, &action);
 	if (error)
 		return (error);
 
 	if (!strcmp(action, "continue"))
 		root_mount_onfail = A_CONTINUE;
 	else if (!strcmp(action, "panic"))
 		root_mount_onfail = A_PANIC;
 	else if (!strcmp(action, "reboot"))
 		root_mount_onfail = A_REBOOT;
 	else if (!strcmp(action, "retry"))
 		root_mount_onfail = A_RETRY;
 	else {
 		printf("rootmount: %s: unknown action\n", action);
 		error = EINVAL;
 	}
 
 	free(action, M_TEMP);
 	return (0);
 }
 
 static int
 parse_dir_timeout(char **conf)
 {
 	char *tok, *endtok;
 	long secs;
 	int error;
 
 	error = parse_token(conf, &tok);
 	if (error)
 		return (error);
 
 	secs = strtol(tok, &endtok, 0);
 	error = (secs < 0 || *endtok != '\0') ? EINVAL : 0;
 	if (!error)
 		root_mount_timeout = secs;
 	free(tok, M_TEMP);
 	return (error);
 }
 
 static int
 parse_directive(char **conf)
 {
 	char *dir;
 	int error;
 
 	error = parse_token(conf, &dir);
 	if (error)
 		return (error);
 
 	if (strcmp(dir, ".ask") == 0)
 		error = parse_dir_ask(conf);
 	else if (strcmp(dir, ".md") == 0)
 		error = parse_dir_md(conf);
 	else if (strcmp(dir, ".onfail") == 0)
 		error = parse_dir_onfail(conf);
 	else if (strcmp(dir, ".timeout") == 0)
 		error = parse_dir_timeout(conf);
 	else {
 		printf("mountroot: invalid directive `%s'\n", dir);
 		/* Ignore the rest of the line. */
 		(void)parse_skipto(conf, '\n');
 		error = EINVAL;
 	}
 	free(dir, M_TEMP);
 	return (error);
 }
 
 static int
 parse_mount_dev_present(const char *dev)
 {
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, dev, curthread);
 	error = namei(&nd);
 	if (!error)
 		vput(nd.ni_vp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error != 0) ? 0 : 1;
 }
 
 #define	ERRMSGL	255
 static int
 parse_mount(char **conf)
 {
 	char *errmsg;
 	struct mntarg *ma;
 	char *dev, *fs, *opts, *tok;
 	int error;
 
 	error = parse_token(conf, &tok);
 	if (error)
 		return (error);
 	fs = tok;
 	error = parse_skipto(&tok, ':');
 	if (error) {
 		free(fs, M_TEMP);
 		return (error);
 	}
 	parse_poke(&tok, '\0');
 	parse_advance(&tok);
 	dev = tok;
 
 	if (root_mount_mddev != -1) {
 		/* Handle substitution for the md unit number. */
 		tok = strstr(dev, "md#");
 		if (tok != NULL)
 			tok[2] = '0' + root_mount_mddev;
 	}
 
 	/* Parse options. */
 	error = parse_token(conf, &tok);
 	opts = (error == 0) ? tok : NULL;
 
 	printf("Trying to mount root from %s:%s [%s]...\n", fs, dev,
 	    (opts != NULL) ? opts : "");
 
 	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
 
 	if (vfs_byname(fs) == NULL) {
 		strlcpy(errmsg, "unknown file system", ERRMSGL);
 		error = ENOENT;
 		goto out;
 	}
 
 	error = vfs_mountroot_wait_if_neccessary(fs, dev);
 	if (error != 0)
 		goto out;
 
 	ma = NULL;
 	ma = mount_arg(ma, "fstype", fs, -1);
 	ma = mount_arg(ma, "fspath", "/", -1);
 	ma = mount_arg(ma, "from", dev, -1);
 	ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
 	ma = mount_arg(ma, "ro", NULL, 0);
 	ma = parse_mountroot_options(ma, opts);
 	error = kernel_mount(ma, MNT_ROOTFS);
 
  out:
 	if (error) {
 		printf("Mounting from %s:%s failed with error %d",
 		    fs, dev, error);
 		if (errmsg[0] != '\0')
 			printf(": %s", errmsg);
 		printf(".\n");
 	}
 	free(fs, M_TEMP);
 	free(errmsg, M_TEMP);
 	if (opts != NULL)
 		free(opts, M_TEMP);
 	/* kernel_mount can return -1 on error. */
 	return ((error < 0) ? EDOOFUS : error);
 }
 #undef ERRMSGL
 
 static int
 vfs_mountroot_parse(struct sbuf *sb, struct mount *mpdevfs)
 {
 	struct mount *mp;
 	char *conf;
 	int error;
 
 	root_mount_mddev = -1;
 
 retry:
 	conf = sbuf_data(sb);
 	mp = TAILQ_NEXT(mpdevfs, mnt_list);
 	error = (mp == NULL) ? 0 : EDOOFUS;
 	root_mount_onfail = A_CONTINUE;
 	while (mp == NULL) {
 		error = parse_skipto(&conf, CC_NONWHITESPACE);
 		if (error == PE_EOL) {
 			parse_advance(&conf);
 			continue;
 		}
 		if (error < 0)
 			break;
 		switch (parse_peek(&conf)) {
 		case '#':
 			error = parse_skipto(&conf, '\n');
 			break;
 		case '.':
 			error = parse_directive(&conf);
 			break;
 		default:
 			error = parse_mount(&conf);
 			if (error == -1) {
 				printf("mountroot: invalid file system "
 				    "specification.\n");
 				error = 0;
 			}
 			break;
 		}
 		if (error < 0)
 			break;
 		/* Ignore any trailing garbage on the line. */
 		if (parse_peek(&conf) != '\n') {
 			printf("mountroot: advancing to next directive...\n");
 			(void)parse_skipto(&conf, '\n');
 		}
 		mp = TAILQ_NEXT(mpdevfs, mnt_list);
 	}
 	if (mp != NULL)
 		return (0);
 
 	/*
 	 * We failed to mount (a new) root.
 	 */
 	switch (root_mount_onfail) {
 	case A_CONTINUE:
 		break;
 	case A_PANIC:
 		panic("mountroot: unable to (re-)mount root.");
 		/* NOTREACHED */
 	case A_RETRY:
 		goto retry;
 	case A_REBOOT:
 		kern_reboot(RB_NOSYNC);
 		/* NOTREACHED */
 	}
 
 	return (error);
 }
 
 static void
 vfs_mountroot_conf0(struct sbuf *sb)
 {
 	char *s, *tok, *mnt, *opt;
 	int error;
 
 	sbuf_printf(sb, ".onfail panic\n");
 	sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
 	if (boothowto & RB_ASKNAME)
 		sbuf_printf(sb, ".ask\n");
 #ifdef ROOTDEVNAME
 	if (boothowto & RB_DFLTROOT)
 		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
 #endif
 	if (boothowto & RB_CDROM) {
 		sbuf_printf(sb, "cd9660:/dev/cd0 ro\n");
 		sbuf_printf(sb, ".timeout 0\n");
 		sbuf_printf(sb, "cd9660:/dev/cd1 ro\n");
 		sbuf_printf(sb, ".timeout %d\n", root_mount_timeout);
 	}
 	s = kern_getenv("vfs.root.mountfrom");
 	if (s != NULL) {
 		opt = kern_getenv("vfs.root.mountfrom.options");
 		tok = s;
 		error = parse_token(&tok, &mnt);
 		while (!error) {
 			sbuf_printf(sb, "%s %s\n", mnt,
 			    (opt != NULL) ? opt : "");
 			free(mnt, M_TEMP);
 			error = parse_token(&tok, &mnt);
 		}
 		if (opt != NULL)
 			freeenv(opt);
 		freeenv(s);
 	}
 	if (rootdevnames[0] != NULL)
 		sbuf_printf(sb, "%s\n", rootdevnames[0]);
 	if (rootdevnames[1] != NULL)
 		sbuf_printf(sb, "%s\n", rootdevnames[1]);
 #ifdef ROOTDEVNAME
 	if (!(boothowto & RB_DFLTROOT))
 		sbuf_printf(sb, "%s\n", ROOTDEVNAME);
 #endif
 	if (!(boothowto & RB_ASKNAME))
 		sbuf_printf(sb, ".ask\n");
 }
 
 static int
 vfs_mountroot_readconf(struct thread *td, struct sbuf *sb)
 {
 	static char buf[128];
 	struct nameidata nd;
 	off_t ofs;
 	ssize_t resid;
 	int error, flags, len;
 
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
 	flags = FREAD;
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return (error);
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	ofs = 0;
 	len = sizeof(buf) - 1;
 	while (1) {
 		error = vn_rdwr(UIO_READ, nd.ni_vp, buf, len, ofs,
 		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred,
 		    NOCRED, &resid, td);
 		if (error)
 			break;
 		if (resid == len)
 			break;
 		buf[len - resid] = 0;
 		sbuf_printf(sb, "%s", buf);
 		ofs += len - resid;
 	}
 
 	VOP_UNLOCK(nd.ni_vp, 0);
 	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 	return (error);
 }
 
 static void
 vfs_mountroot_wait(void)
 {
 	struct root_hold_token *h;
 	struct timeval lastfail;
 	int curfail;
 
 	curfail = 0;
 	while (1) {
 		DROP_GIANT();
 		g_waitidle();
 		PICKUP_GIANT();
 		mtx_lock(&root_holds_mtx);
 		if (LIST_EMPTY(&root_holds)) {
 			mtx_unlock(&root_holds_mtx);
 			break;
 		}
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("Root mount waiting for:");
 			LIST_FOREACH(h, &root_holds, list)
 				printf(" %s", h->who);
 			printf("\n");
 		}
 		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
 		    hz);
 	}
 }
 
 static int
 vfs_mountroot_wait_if_neccessary(const char *fs, const char *dev)
 {
 	int delay, timeout;
 
 	/*
 	 * In case of ZFS and NFS we don't have a way to wait for
 	 * specific device.
 	 */
 	if (strcmp(fs, "zfs") == 0 || strstr(fs, "nfs") != NULL ||
 	    dev[0] == '\0') {
 		vfs_mountroot_wait();
 		return (0);
 	}
 
 	/*
 	 * Otherwise, no point in waiting if the device is already there.
 	 * Note that we must wait for GEOM to finish reconfiguring itself,
 	 * eg for geom_part(4) to finish tasting.
 	 */
 	DROP_GIANT();
 	g_waitidle();
 	PICKUP_GIANT();
 	if (parse_mount_dev_present(dev))
 		return (0);
 
 	/*
 	 * No luck.  Let's wait.  This code looks weird, but it's that way
 	 * to behave exactly as it used to work before.
 	 */
 	vfs_mountroot_wait();
 	printf("mountroot: waiting for device %s...\n", dev);
 	delay = hz / 10;
 	timeout = root_mount_timeout * hz;
 	do {
 		pause("rmdev", delay);
 		timeout -= delay;
 	} while (timeout > 0 && !parse_mount_dev_present(dev));
 
 	if (timeout <= 0)
 		return (ENODEV);
 
 	return (0);
 }
 
 void
 vfs_mountroot(void)
 {
 	struct mount *mp;
 	struct sbuf *sb;
 	struct thread *td;
 	time_t timebase;
 	int error;
 
 	td = curthread;
 
 	sb = sbuf_new_auto();
 	vfs_mountroot_conf0(sb);
 	sbuf_finish(sb);
 
 	error = vfs_mountroot_devfs(td, &mp);
 	while (!error) {
 		error = vfs_mountroot_parse(sb, mp);
 		if (!error) {
 			vfs_mountroot_shuffle(td, mp);
 			sbuf_clear(sb);
 			error = vfs_mountroot_readconf(td, sb);
 			sbuf_finish(sb);
 		}
 	}
 
 	sbuf_delete(sb);
 
 	/*
 	 * Iterate over all currently mounted file systems and use
 	 * the time stamp found to check and/or initialize the RTC.
 	 * Call inittodr() only once and pass it the largest of the
 	 * timestamps we encounter.
 	 */
 	timebase = 0;
 	mtx_lock(&mountlist_mtx);
 	mp = TAILQ_FIRST(&mountlist);
 	while (mp != NULL) {
 		if (mp->mnt_time > timebase)
 			timebase = mp->mnt_time;
 		mp = TAILQ_NEXT(mp, mnt_list);
 	}
 	mtx_unlock(&mountlist_mtx);
 	inittodr(timebase);
 
 	/* Keep prison0's root in sync with the global rootvnode. */
 	mtx_lock(&prison0.pr_mtx);
 	prison0.pr_root = rootvnode;
 	vref(prison0.pr_root);
 	mtx_unlock(&prison0.pr_mtx);
 
 	mtx_lock(&root_holds_mtx);
 	atomic_store_rel_int(&root_mount_complete, 1);
 	wakeup(&root_mount_complete);
 	mtx_unlock(&root_holds_mtx);
 
 	EVENTHANDLER_INVOKE(mountroot);
 }
 
 static struct mntarg *
 parse_mountroot_options(struct mntarg *ma, const char *options)
 {
 	char *p;
 	char *name, *name_arg;
 	char *val, *val_arg;
 	char *opts;
 
 	if (options == NULL || options[0] == '\0')
 		return (ma);
 
 	p = opts = strdup(options, M_MOUNT);
 	if (opts == NULL) {
 		return (ma);
 	}
 
 	while((name = strsep(&p, ",")) != NULL) {
 		if (name[0] == '\0')
 			break;
 
 		val = strchr(name, '=');
 		if (val != NULL) {
 			*val = '\0';
 			++val;
 		}
 		if( strcmp(name, "rw") == 0 ||
 		    strcmp(name, "noro") == 0) {
 			/*
 			 * The first time we mount the root file system,
 			 * we need to mount 'ro', so We need to ignore
 			 * 'rw' and 'noro' mount options.
 			 */
 			continue;
 		}
 		name_arg = strdup(name, M_MOUNT);
 		val_arg = NULL;
 		if (val != NULL)
 			val_arg = strdup(val, M_MOUNT);
 
 		ma = mount_arg(ma, name_arg, val_arg,
 		    (val_arg != NULL ? -1 : 0));
 	}
 	free(opts, M_MOUNT);
 	return (ma);
 }
Index: stable/11/sys/sys/vnode.h
===================================================================
--- stable/11/sys/sys/vnode.h	(revision 310958)
+++ stable/11/sys/sys/vnode.h	(revision 310959)
@@ -1,899 +1,899 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	f - freelist mutex
  *	i - interlock
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	const char *v_tag;			/* u type of underlying data */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 * See #defines below for renaming to v_* namespace.
 	 */
 	union {
 		struct mount	*vu_mount;	/* v ptr to mountpoint (VDIR) */
 		struct socket	*vu_socket;	/* v unix domain net (VSOCK) */
 		struct cdev	*vu_cdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*vu_fifoinfo;	/* v fifo (VFIFO) */
 	} v_un;
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* f vnode active/free lists */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
 	u_int	v_hash;
 	enum	vtype v_type;			/* u vnode type */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	v_mountedhere	v_un.vu_mount
 #define	v_socket	v_un.vu_socket
 #define	v_rdev		v_un.vu_cdev
 #define	v_fifoinfo	v_un.vu_fifoinfo
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* socket, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /* We don't need to lock the knlist */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VI_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_TEXT		0x0020	/* vnode is a pure text prototype */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	short		va_nlink;	/* number of references to file */
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	dev_t		va_fsid;	/* filesystem id */
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather then bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow to change the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_MNTREF	0x0010	/* vn_start_write: mp is already ref-ed */
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 #if 0
 void	assert_vop_elocked_other(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_locked(struct vnode *vp, const char *str);
 #if 0
 voi0	assert_vop_slocked(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_locked_other((vp), (str))
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)	assert_vop_slocked((vp), (str))
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str)
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(int newhashsize);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
-void	cache_purgevfs(struct mount *mp);
+void	cache_purgevfs(struct mount *mp, bool force);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(u_int count);
 void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf,
 	    u_int *buflen);
 #define textvp_fullpath(p, rb, rfb) \
 	vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb)
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 #define	vdrop(vp)	_vdrop((vp), 0)
 #define	vdropl(vp)	_vdrop((vp), 1)
 void	_vdrop(struct vnode *, bool);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
 #define	vhold(vp)	_vhold((vp), 0)
 #define	vholdl(vp)	_vhold((vp), 1)
 void	_vhold(struct vnode *, bool);
 void	vinactive(struct vnode *, struct thread *);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	_vn_lock(struct vnode *vp, int flags, char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 void	vnlru_free(int, struct vfsops *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_close_post(void *a, int rc);
 void	vop_create_post(void *a, int rc);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_post(void *a, int rc);
 
 #ifdef DEBUG_VFS_LOCKS
 void	vop_strategy_pre(void *a);
 void	vop_lock_pre(void *a);
 void	vop_lock_post(void *a, int rc);
 void	vop_unlock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
 #else
 #define	vop_strategy_pre(x)	do { } while (0)
 #define	vop_lock_pre(x)		do { } while (0)
 #define	vop_lock_post(x, y)	do { } while (0)
 #define	vop_unlock_post(x, y)	do { } while (0)
 #define	vop_unlock_pre(x)	do { } while (0)
 #endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefl(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* fifo_vnops.c */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(int newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 void vfs_mark_atime(struct vnode *vp, struct ucred *cred);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: stable/11
===================================================================
--- stable/11	(revision 310958)
+++ stable/11	(revision 310959)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r305378-305379,305386,305684,306224,306608,306803,307650,307685,308407,308665,308667,309067