Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 355789)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c	(revision 355790)
@@ -1,1360 +1,1363 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  */
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' directory, but this may expand in the
  * future.  The elements are built using the GFS primitives, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  * 	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding vnode.
  *
  * All mounts are handled automatically by the kernel, but unmounts are
  * (currently) handled from user land.  The main reason is that there is no
  * reliable way to auto-unmount the filesystem when it's "no longer in use".
  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
  * unmounts any snapshots within the snapshot directory.
  *
  * The '.zfs', '.zfs/snapshot', and all directories created under
  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
  * share the same vfs_t as the head filesystem (what '.zfs' lives under).
  *
  * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>'
  * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
  * However, vnodes within these mounted on file systems have their v_vfsp
  * fields set to the head filesystem to make NFS happy (see
  * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
  * so that it cannot be freed until all snapshots have been unmounted.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/namei.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/mount.h>
 #include <sys/zap.h>
 
 #include "zfs_namecheck.h"
 
 /* Common access mode for all virtual directories under the ctldir */
 const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
     S_IROTH | S_IXOTH;
 
 /*
  * "Synthetic" filesystem implementation.
  */
 
 /*
  * Assert that A implies B.
  */
 #define KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg));
 
 static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
 
 typedef struct sfs_node {
 	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
 	uint64_t	sn_parent_id;
 	uint64_t	sn_id;
 } sfs_node_t;
 
 /*
  * Check the parent's ID as well as the node's to account for a chance
  * that IDs originating from different domains (snapshot IDs, artifical
  * IDs, znode IDs) may clash.
  */
 static int
 sfs_compare_ids(struct vnode *vp, void *arg)
 {
 	sfs_node_t *n1 = vp->v_data;
 	sfs_node_t *n2 = arg;
 	bool equal;
 
 	equal = n1->sn_id == n2->sn_id &&
 	    n1->sn_parent_id == n2->sn_parent_id;
 
 	/* Zero means equality. */
 	return (!equal);
 }
 
 static int
 sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
    uint64_t id, struct vnode **vpp)
 {
 	sfs_node_t search;
 	int err;
 
 	search.sn_id = id;
 	search.sn_parent_id = parent_id;
 	err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
 	    sfs_compare_ids, &search);
 	return (err);
 }
 
 static int
 sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
    uint64_t id, struct vnode **vpp)
 {
 	int err;
 
 	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
 	err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
 	    sfs_compare_ids, vp->v_data);
 	return (err);
 }
 
 static void
 sfs_vnode_remove(struct vnode *vp)
 {
 	vfs_hash_remove(vp);
 }
 
 typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
 
 static int
 sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
     const char *tag, struct vop_vector *vops,
     sfs_vnode_setup_fn setup, void *arg,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
 	if (error != 0 || *vpp != NULL) {
 		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
 		    "sfs vnode with no data");
 		return (error);
 	}
 
 	/* Allocate a new vnode/inode. */
 	error = getnewvnode(tag, mp, vops, &vp);
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	/*
 	 * Exclusively lock the vnode vnode while it's being constructed.
 	 */
 	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	setup(vp, arg);
 
 	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
 	if (error != 0 || *vpp != NULL) {
 		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
 		    "sfs vnode with no data");
 		return (error);
 	}
 
 	*vpp = vp;
 	return (0);
 }
 
 static void
 sfs_print_node(sfs_node_t *node)
 {
 	printf("\tname = %s\n", node->sn_name);
 	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
 	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
 }
 
 static sfs_node_t *
 sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
 {
 	struct sfs_node *node;
 
 	KASSERT(strlen(name) < sizeof(node->sn_name),
 	    ("sfs node name is too long"));
 	KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
 	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
 	strlcpy(node->sn_name, name, sizeof(node->sn_name));
 	node->sn_parent_id = parent_id;
 	node->sn_id = id;
 
 	return (node);
 }
 
 static void
 sfs_destroy_node(sfs_node_t *node)
 {
 	free(node, M_SFSNODES);
 }
 
 static void *
 sfs_reclaim_vnode(vnode_t *vp)
 {
 	sfs_node_t *node;
 	void *data;
 
 	sfs_vnode_remove(vp);
 	data = vp->v_data;
 	vp->v_data = NULL;
 	return (data);
 }
 
 static int
 sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
     uio_t *uio, off_t *offp)
 {
 	struct dirent entry;
 	int error;
 
 	/* Reset ncookies for subsequent use of vfs_read_dirent. */
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	if (uio->uio_resid < sizeof(entry))
 		return (SET_ERROR(EINVAL));
 
 	if (uio->uio_offset < 0)
 		return (SET_ERROR(EINVAL));
 	if (uio->uio_offset == 0) {
 		entry.d_fileno = id;
 		entry.d_type = DT_DIR;
 		entry.d_name[0] = '.';
 		entry.d_namlen = 1;
 		entry.d_reclen = sizeof(entry);
 		dirent_terminate(&entry);
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0)
 			return (SET_ERROR(error));
 	}
 
 	if (uio->uio_offset < sizeof(entry))
 		return (SET_ERROR(EINVAL));
 	if (uio->uio_offset == sizeof(entry)) {
 		entry.d_fileno = parent_id;
 		entry.d_type = DT_DIR;
 		entry.d_name[0] = '.';
 		entry.d_name[1] = '.';
 		entry.d_namlen = 2;
 		entry.d_reclen = sizeof(entry);
 		dirent_terminate(&entry);
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0)
 			return (SET_ERROR(error));
 	}
 
 	if (offp != NULL)
 		*offp = 2 * sizeof(entry);
 	return (0);
 }
 
 
 /*
  * .zfs inode namespace
  *
  * We need to generate unique inode numbers for all files and directories
  * within the .zfs pseudo-filesystem.  We use the following scheme:
  *
  * 	ENTRY			ZFSCTL_INODE
  * 	.zfs			1
  * 	.zfs/snapshot		2
  * 	.zfs/snapshot/<snap>	objectid(snap)
  */
 #define	ZFSCTL_INO_SNAP(id)	(id)
 
 static struct vop_vector zfsctl_ops_root;
 static struct vop_vector zfsctl_ops_snapdir;
 static struct vop_vector zfsctl_ops_snapshot;
 static struct vop_vector zfsctl_ops_shares_dir;
 
 void
 zfsctl_init(void)
 {
 }
 
 void
 zfsctl_fini(void)
 {
 }
 
 boolean_t
 zfsctl_is_node(vnode_t *vp)
 {
 	return (vn_matchops(vp, zfsctl_ops_root) ||
 	    vn_matchops(vp, zfsctl_ops_snapdir) ||
 	    vn_matchops(vp, zfsctl_ops_snapshot) ||
 	    vn_matchops(vp, zfsctl_ops_shares_dir));
 
 }
 
 typedef struct zfsctl_root {
 	sfs_node_t	node;
 	sfs_node_t	*snapdir;
 	timestruc_t	cmtime;
 } zfsctl_root_t;
 
 
 /*
  * Create the '.zfs' directory.
  */
 void
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	zfsctl_root_t *dot_zfs;
 	sfs_node_t *snapdir;
 	vnode_t *rvp;
 	uint64_t crtime[2];
 
 	ASSERT(zfsvfs->z_ctldir == NULL);
 
 	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
 	    ZFSCTL_INO_SNAPDIR);
 	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
 	    ZFSCTL_INO_ROOT);
 	dot_zfs->snapdir = snapdir;
 
 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
 	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
 	    &crtime, sizeof(crtime)));
 	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
 	vput(rvp);
 
 	zfsvfs->z_ctldir = dot_zfs;
 }
 
 /*
  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
  * The nodes must not have any associated vnodes by now as they should be
  * vflush-ed.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
 	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
 	zfsvfs->z_ctldir = NULL;
 }
 
 static int
 zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	return (VFS_ROOT(mp, flags, vpp));
 }
 
 static void
 zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
 {
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* We support shared locking. */
 	VN_LOCK_ASHARE(vp);
 	vp->v_type = VDIR;
 	vp->v_data = arg;
 }
 
 static int
 zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	void *node;
 	int err;
 
 	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
 	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
 	    zfsctl_common_vnode_setup, node, vpp);
 	return (err);
 }
 
 static int
 zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
     struct vnode **vpp)
 {
 	void *node;
 	int err;
 
 	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
 	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
 	   &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
 	return (err);
 }
 
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it.
  */
 int
 zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
 {
 	vnode_t *vp;
 	int error;
 
 	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
 	return (error);
 }
 
 /*
  * Common open routine.  Disallow any write access.
  */
 static int
 zfsctl_common_open(struct vop_open_args *ap)
 {
 	int flags = ap->a_mode;
 
 	if (flags & FWRITE)
 		return (SET_ERROR(EACCES));
 
 	return (0);
 }
 
 /*
  * Common close routine.  Nothing to do here.
  */
 /* ARGSUSED */
 static int
 zfsctl_common_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 /*
  * Common access routine.  Disallow writes.
  */
 static int
 zfsctl_common_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	accmode_t accmode = ap->a_accmode;
 
 	if (accmode & VWRITE)
 		return (SET_ERROR(EACCES));
 	return (0);
 }
 
 /*
  * Common getattr function.  Fill in basic information.
  */
 static void
 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
 {
 	timestruc_t	now;
 	sfs_node_t *node;
 
 	node = vp->v_data;
 
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = 0;
 	/*
 	 * We are a purely virtual object, so we have no
 	 * blocksize or allocated blocks.
 	 */
 	vap->va_blksize = 0;
 	vap->va_nblocks = 0;
 	vap->va_seq = 0;
 	vn_fsid(vp, vap);
 	vap->va_mode = zfsctl_ctldir_mode;
 	vap->va_type = VDIR;
 	/*
 	 * We live in the now (for atime).
 	 */
 	gethrestime(&now);
 	vap->va_atime = now;
 	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_flags = 0;
 
 	vap->va_nodeid = node->sn_id;
 
 	/* At least '.' and '..'. */
 	vap->va_nlink = 2;
 }
 
 static int
 zfsctl_common_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 	vnode_t		*vp = ap->a_vp;
 	fid_t		*fidp = (void *)ap->a_fid;
 	sfs_node_t	*node = vp->v_data;
 	uint64_t	object = node->sn_id;
 	zfid_short_t	*zfid;
 	int		i;
 
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < sizeof(zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* .zfs nodes always have a generation number of 0 */
 	for (i = 0; i < sizeof(zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
 	return (0);
 }
 
 static int
 zfsctl_common_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	(void) sfs_reclaim_vnode(vp);
 	return (0);
 }
 
 static int
 zfsctl_common_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	sfs_print_node(ap->a_vp->v_data);
 	return (0);
 }
 
 /*
  * Get root directory attributes.
  */
 static int
 zfsctl_root_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	zfsctl_root_t *node = vp->v_data;
 
 	zfsctl_common_getattr(vp, vap);
 	vap->va_ctime = node->cmtime;
 	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
 	vap->va_nlink += 1; /* snapdir */
 	vap->va_size = vap->va_nlink;
 	return (0);
 }
 
 /*
  * When we lookup "." we still can be asked to lock it
  * differently, can't we?
  */
 int
 zfsctl_relock_dot(vnode_t *dvp, int ltype)
 {
 	vref(dvp);
 	if (ltype != VOP_ISLOCKED(dvp)) {
 		if (ltype == LK_EXCLUSIVE)
 			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 		else /* if (ltype == LK_SHARED) */
 			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 
 		/* Relock for the "." case may left us with reclaimed vnode. */
 		if (VN_IS_DOOMED(dvp)) {
 			vrele(dvp);
 			return (SET_ERROR(ENOENT));
 		}
 	}
 	return (0);
 }
 
 /*
  * Special case the handling of "..".
  */
 int
 zfsctl_root_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	cred_t *cr = ap->a_cnp->cn_cred;
 	int flags = ap->a_cnp->cn_flags;
 	int lkflags = ap->a_cnp->cn_lkflags;
 	int nameiop = ap->a_cnp->cn_nameiop;
 	int err;
 	int ltype;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 		return (SET_ERROR(ENOTSUP));
 
 	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
 		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
 		if (err == 0)
 			*vpp = dvp;
 	} else if ((flags & ISDOTDOT) != 0) {
 		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
 		    lkflags, vpp);
 	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
 		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
 	} else {
 		err = SET_ERROR(ENOENT);
 	}
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 static int
 zfsctl_root_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct dirent entry;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	zfsctl_root_t *node = vp->v_data;
 	uio_t *uio = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	off_t dots_offset;
 	int error;
 
 	ASSERT(vp->v_type == VDIR);
 
 	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
 	    &dots_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG) /* ran out of destination space */
 			error = 0;
 		return (error);
 	}
 	if (uio->uio_offset != dots_offset)
 		return (SET_ERROR(EINVAL));
 
 	CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
 	entry.d_fileno = node->snapdir->sn_id;
 	entry.d_type = DT_DIR;
 	strcpy(entry.d_name, node->snapdir->sn_name);
 	entry.d_namlen = strlen(entry.d_name);
 	entry.d_reclen = sizeof(entry);
 	dirent_terminate(&entry);
 	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG)
 			error = 0;
 		return (SET_ERROR(error));
 	}
 	if (eofp != NULL)
 		*eofp = 1;
 	return (0);
 }
 
 static int
 zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
 {
 	static const char dotzfs_name[4] = ".zfs";
 	vnode_t *dvp;
 	int error;
 
 	if (*ap->a_buflen < sizeof (dotzfs_name))
 		return (SET_ERROR(ENOMEM));
 
 	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
 	    LK_SHARED, &dvp);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	VOP_UNLOCK(dvp, 0);
 	*ap->a_vpp = dvp;
 	*ap->a_buflen -= sizeof (dotzfs_name);
 	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
 	return (0);
 }
 
 static int
 zfsctl_common_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	/*
 	 * We care about ACL variables so that user land utilities like ls
 	 * can display them correctly.  Since the ctldir's st_dev is set to be
 	 * the same as the parent dataset, we must support all variables that
 	 * it supports.
 	 */
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
 		return (0);
 
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
 		return (0);
 
 	case _PC_ACL_NFS4:
 		*ap->a_retval = 1;
 		return (0);
 
 	case _PC_ACL_PATH_MAX:
 		*ap->a_retval = ACL_MAX_ENTRIES;
 		return (0);
 
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
 /**
  * Returns a trivial ACL
  */
 int
 zfsctl_common_getacl(ap)
 	struct vop_getacl_args /* {
 		struct vnode *vp;
 		acl_type_t a_type;
 		struct acl *a_aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int i;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
 	/*
 	 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
 	 * attributes.  That is not the case for the ctldir, so we must clear
 	 * those bits.  We also must clear ACL_READ_NAMED_ATTRS, because xattrs
 	 * aren't supported by the ctldir.
 	 */
 	for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
 		struct acl_entry *entry;
 		entry = &(ap->a_aclp->acl_entry[i]);
 		uint32_t old_perm = entry->ae_perm;
 		entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
 		    ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
 		    ACL_READ_NAMED_ATTRS );
 	}
 
 	return (0);
 }
 
 static struct vop_vector zfsctl_ops_root = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_ioctl =	VOP_EINVAL,
 	.vop_getattr =	zfsctl_root_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	zfsctl_root_readdir,
 	.vop_lookup =	zfsctl_root_lookup,
 	.vop_inactive =	VOP_NULL,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 	.vop_print =	zfsctl_common_print,
 	.vop_vptocnp =	zfsctl_root_vptocnp,
 	.vop_pathconf =	zfsctl_common_pathconf,
 	.vop_getacl =	zfsctl_common_getacl,
 };
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
 
 static int
 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
 {
 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 
 	dmu_objset_name(os, zname);
 	if (strlen(zname) + 1 + strlen(name) >= len)
 		return (SET_ERROR(ENAMETOOLONG));
 	(void) strcat(zname, "@");
 	(void) strcat(zname, name);
 	return (0);
 }
 
 static int
 zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
 {
 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
 	int err;
 
 	err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
 	return (err);
 }
 
 /*
  * Given a vnode get a root vnode of a filesystem mounted on top of
  * the vnode, if any.  The root vnode is referenced and locked.
  * If no filesystem is mounted then the orinal vnode remains referenced
  * and locked.  If any error happens the orinal vnode is unlocked and
  * released.
  */
 static int
 zfsctl_mounted_here(vnode_t **vpp, int flags)
 {
 	struct mount *mp;
 	int err;
 
 	ASSERT_VOP_LOCKED(*vpp, __func__);
 	ASSERT3S((*vpp)->v_type, ==, VDIR);
 
 	if ((mp = (*vpp)->v_mountedhere) != NULL) {
 		err = vfs_busy(mp, 0);
 		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
 		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
 		vput(*vpp);
 		err = VFS_ROOT(mp, flags, vpp);
 		vfs_unbusy(mp);
 		return (err);
 	}
 	return (EJUSTRETURN);
 }
 
 typedef struct {
 	const char *snap_name;
 	uint64_t    snap_id;
 } snapshot_setup_arg_t;
 
 static void
 zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
 {
 	snapshot_setup_arg_t *ssa = arg;
 	sfs_node_t *node;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	node = sfs_alloc_node(sizeof(sfs_node_t),
 	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
 	zfsctl_common_vnode_setup(vp, node);
 
 	/* We have to support recursive locking. */
 	VN_LOCK_AREC(vp);
 }
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
  * Perform a mount of the associated dataset on top of the vnode.
  * There are four possibilities:
  * - the snapshot node and vnode do not exist
  * - the snapshot vnode is covered by the mounted snapshot
  * - the snapshot vnode is not covered yet, the mount operation is in progress
  * - the snapshot vnode is not covered, because the snapshot has been unmounted
  * The last two states are transient and should be relatively short-lived.
  */
 int
 zfsctl_snapdir_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	vnode_t *dvp = ap->a_dvp;
 	vnode_t **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	char name[NAME_MAX + 1];
 	char fullname[ZFS_MAX_DATASET_NAME_LEN];
 	char *mountpoint;
 	size_t mountpoint_len;
 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
 	uint64_t snap_id;
 	int nameiop = cnp->cn_nameiop;
 	int lkflags = cnp->cn_lkflags;
 	int flags = cnp->cn_flags;
 	int err;
 
 	ASSERT(dvp->v_type == VDIR);
 
 	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 		return (SET_ERROR(ENOTSUP));
 
 	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
 		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
 		if (err == 0)
 			*vpp = dvp;
 		return (err);
 	}
 	if (flags & ISDOTDOT) {
 		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
 		    vpp);
 		return (err);
 	}
 
 	if (cnp->cn_namelen >= sizeof(name))
 		return (SET_ERROR(ENAMETOOLONG));
 
 	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
 	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
 	if (err != 0)
 		return (SET_ERROR(ENOENT));
 
 	for (;;) {
 		snapshot_setup_arg_t ssa;
 
 		ssa.snap_name = name;
 		ssa.snap_id = snap_id;
 		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
 		   snap_id, "zfs", &zfsctl_ops_snapshot,
 		   zfsctl_snapshot_vnode_setup, &ssa, vpp);
 		if (err != 0)
 			return (err);
 
 		/* Check if a new vnode has just been created. */
 		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
 			break;
 
 		/*
 		 * Check if a snapshot is already mounted on top of the vnode.
 		 */
 		err = zfsctl_mounted_here(vpp, lkflags);
 		if (err != EJUSTRETURN)
 			return (err);
 
 		/*
 		 * If the vnode is not covered, then either the mount operation
 		 * is in progress or the snapshot has already been unmounted
 		 * but the vnode hasn't been inactivated and reclaimed yet.
 		 * We can try to re-use the vnode in the latter case.
 		 */
 		VI_LOCK(*vpp);
 		if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
 			/* Upgrade to exclusive lock in order to:
 			 * - avoid race conditions
 			 * - satisfy the contract of mount_snapshot()
 			 */
 			err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
 			if (err == 0)
 				break;
 		} else {
 			VI_UNLOCK(*vpp);
 		}
 
 		/*
 		 * In this state we can loop on uncontested locks and starve
 		 * the thread doing the lengthy, non-trivial mount operation.
 		 * So, yield to prevent that from happening.
 		 */
 		vput(*vpp);
 		kern_yield(PRI_USER);
 	}
 
 	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
 
 	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
 	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
 	(void) snprintf(mountpoint, mountpoint_len,
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
 
 	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*
 		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
 		 *
 		 * This is where we lie about our v_vfsp in order to
 		 * make .zfs/snapshot/<snapname> accessible over NFS
 		 * without requiring manual mounts of <snapname>.
 		 */
 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
 
 		/* Clear the root flag (set via VFS_ROOT) as well. */
 		(*vpp)->v_vflag &= ~VV_ROOT;
 	}
 
 	if (err != 0)
 		*vpp = NULL;
 	return (err);
 }
 
 static int
 zfsctl_snapdir_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	struct dirent entry;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	uio_t *uio = ap->a_uio;
 	int *eofp = ap->a_eofflag;
 	off_t dots_offset;
 	int error;
 
 	ASSERT(vp->v_type == VDIR);
 
 	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
 	    &dots_offset);
 	if (error != 0) {
 		if (error == ENAMETOOLONG) /* ran out of destination space */
 			error = 0;
 		return (error);
 	}
 
 	ZFS_ENTER(zfsvfs);
 	for (;;) {
 		uint64_t cookie;
 		uint64_t id;
 
 		cookie = uio->uio_offset - dots_offset;
 
 		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
 		    snapname, &id, &cookie, NULL);
 		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		if (error != 0) {
 			if (error == ENOENT) {
 				if (eofp != NULL)
 					*eofp = 1;
 				error = 0;
 			}
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		entry.d_fileno = id;
 		entry.d_type = DT_DIR;
 		strcpy(entry.d_name, snapname);
 		entry.d_namlen = strlen(entry.d_name);
 		entry.d_reclen = sizeof(entry);
 		/* NOTE: d_off is the offset for the *next* entry. */
 		entry.d_off = cookie + dots_offset;
 		dirent_terminate(&entry);
 		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
 		if (error != 0) {
 			if (error == ENAMETOOLONG)
 				error = 0;
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(error));
 		}
 		uio->uio_offset = cookie + dots_offset;
 	}
 	/* NOTREACHED */
 }
 
 static int
 zfsctl_snapdir_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
 	sfs_node_t *node = vp->v_data;
 	uint64_t snap_count;
 	int err;
 
 	ZFS_ENTER(zfsvfs);
 	zfsctl_common_getattr(vp, vap);
 	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
 	vap->va_mtime = vap->va_ctime;
 	vap->va_birthtime = vap->va_ctime;
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
 		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
 		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
 		if (err != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 		vap->va_nlink += snap_count;
 	}
 	vap->va_size = vap->va_nlink;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static struct vop_vector zfsctl_ops_snapdir = {
 	.vop_default =	&default_vnodeops,
 	.vop_open =	zfsctl_common_open,
 	.vop_close =	zfsctl_common_close,
 	.vop_getattr =	zfsctl_snapdir_getattr,
 	.vop_access =	zfsctl_common_access,
 	.vop_readdir =	zfsctl_snapdir_readdir,
 	.vop_lookup =	zfsctl_snapdir_lookup,
 	.vop_reclaim =	zfsctl_common_reclaim,
 	.vop_fid =	zfsctl_common_fid,
 	.vop_print =	zfsctl_common_print,
 	.vop_pathconf =	zfsctl_common_pathconf,
 	.vop_getacl =	zfsctl_common_getacl,
 };
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
 
 static int
 zfsctl_snapshot_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	VERIFY(vrecycle(vp) == 1);
 	return (0);
 }
 
 static int
 zfsctl_snapshot_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	void *data = vp->v_data;
 
 	sfs_reclaim_vnode(vp);
 	sfs_destroy_node(data);
 	return (0);
 }
 
 static int
 zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct mount *mp;
 	vnode_t *dvp;
 	vnode_t *vp;
 	sfs_node_t *node;
 	size_t len;
 	enum vgetstate vs;
 	int locked;
 	int error;
 
 	vp = ap->a_vp;
 	node = vp->v_data;
 	len = strlen(node->sn_name);
 	if (*ap->a_buflen < len)
 		return (SET_ERROR(ENOMEM));
 
 	/*
 	 * Prevent unmounting of the snapshot while the vnode lock
 	 * is not held.  That is not strictly required, but allows
 	 * us to assert that an uncovered snapshot vnode is never
 	 * "leaked".
 	 */
 	mp = vp->v_mountedhere;
 	if (mp == NULL)
 		return (SET_ERROR(ENOENT));
 	error = vfs_busy(mp, 0);
 	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
 
 	/*
 	 * We can vput the vnode as we can now depend on the reference owned
 	 * by the busied mp.  But we also need to hold the vnode, because
 	 * the reference may go after vfs_unbusy() which has to be called
 	 * before we can lock the vnode again.
 	 */
 	locked = VOP_ISLOCKED(vp);
 	vs = vget_prep(vp);
 	vput(vp);
 
 	/* Look up .zfs/snapshot, our parent. */
 	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
 	if (error == 0) {
 		VOP_UNLOCK(dvp, 0);
 		*ap->a_vpp = dvp;
 		*ap->a_buflen -= len;
 		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
 	}
 	vfs_unbusy(mp);
 	vget_finish(vp, locked | LK_RETRY, vs);
 	return (error);
 }
 
 /*
  * These VP's should never see the light of day.  They should always
  * be covered.
  */
 static struct vop_vector zfsctl_ops_snapshot = {
 	.vop_default =		NULL, /* ensure very restricted access */
 	.vop_inactive =		zfsctl_snapshot_inactive,
 	.vop_need_inactive =	vop_stdneed_inactive,
 	.vop_reclaim =		zfsctl_snapshot_reclaim,
 	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
 	.vop_lock1 =		vop_stdlock,
 	.vop_unlock =		vop_stdunlock,
 	.vop_islocked =		vop_stdislocked,
 	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
 	.vop_print =		zfsctl_common_print,
 };
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
 
 int
 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
 {
 	struct mount *mp;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vnode_t *vp;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 	*zfsvfsp = NULL;
 	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
 	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
 	if (error == 0 && vp != NULL) {
 		/*
 		 * XXX Probably need to at least reference, if not busy, the mp.
 		 */
 		if (vp->v_mountedhere != NULL)
 			*zfsvfsp = vp->v_mountedhere->mnt_data;
 		vput(vp);
 	}
 	if (*zfsvfsp == NULL)
 		return (SET_ERROR(EINVAL));
 	return (0);
 }
 
 /*
  * Unmount any snapshots for the given filesystem.  This is called from
  * zfs_umount() - if we have a ctldir, then go through and unmount all the
  * snapshots.
  */
 int
 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
 {
 	char snapname[ZFS_MAX_DATASET_NAME_LEN];
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	struct mount *mp;
 	vnode_t *dvp;
 	vnode_t *vp;
 	sfs_node_t *node;
 	sfs_node_t *snap;
 	uint64_t cookie;
 	int error;
 
 	ASSERT(zfsvfs->z_ctldir != NULL);
 
 	cookie = 0;
 	for (;;) {
 		uint64_t id;
 
 		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
 		    snapname, &id, &cookie, NULL);
 		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
 		if (error != 0) {
 			if (error == ENOENT)
 				error = 0;
 			break;
 		}
 
 		for (;;) {
 			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
 			    ZFSCTL_INO_SNAPDIR, id, &vp);
 			if (error != 0 || vp == NULL)
 				break;
 
 			mp = vp->v_mountedhere;
 
 			/*
 			 * v_mountedhere being NULL means that the
 			 * (uncovered) vnode is in a transient state
 			 * (mounting or unmounting), so loop until it
 			 * settles down.
 			 */
 			if (mp != NULL)
 				break;
 			vput(vp);
 		}
 		if (error != 0)
 			break;
 		if (vp == NULL)
 			continue;	/* no mountpoint, nothing to do */
 
 		/*
 		 * The mount-point vnode is kept locked to avoid spurious EBUSY
 		 * from a concurrent umount.
 		 * The vnode lock must have recursive locking enabled.
 		 */
 		vfs_ref(mp);
 		error = dounmount(mp, fflags, curthread);
 		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
 		    ("extra references after unmount"));
 		vput(vp);
 		if (error != 0)
 			break;
 	}
 	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
 	    ("force unmounting failed"));
 	return (error);
 }
 
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 355789)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	(revision 355790)
@@ -1,6026 +1,6029 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/vfs.h>
 #include <sys/vm.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/dirent.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_rlock.h>
 #include <sys/extdirent.h>
 #include <sys/kidmap.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <sys/vmmeter.h>
 #include <vm/vm_param.h>
 #include <sys/zil.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1)	A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
  *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
  *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
  *	can return EIO from the calling function.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		ZFS_EXIT(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(*vpp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & FAPPEND) == 0)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 		if (fs_vscan(*vpp, cr, 0) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EACCES));
 		}
 	}
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & (FSYNC | FDSYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	/*
 	 * Clean up any locks held by this process on the vp.
 	 */
 	cleanlocks(vp, ddi_get_pid(), 0);
 	cleanshares(vp, ddi_get_pid());
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/* Decrement the synchronous opens in the znode */
 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 	    ZTOV(zp)->v_type == VREG &&
 	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 		VERIFY(fs_vscan(vp, cr, 1) == 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
  */
 static int
 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
 {
 	znode_t	*zp = VTOZ(vp);
 	uint64_t noff = (uint64_t)*off; /* new offset */
 	uint64_t file_sz;
 	int error;
 	boolean_t hole;
 
 	file_sz = zp->z_size;
 	if (noff >= file_sz)  {
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (cmd == _FIO_SEEK_HOLE)
 		hole = B_TRUE;
 	else
 		hole = B_FALSE;
 
 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 
 	if (error == ESRCH)
 		return (SET_ERROR(ENXIO));
 
 	/*
 	 * We could find a hole that begins after the logical end-of-file,
 	 * because dmu_offset_next() only works on whole blocks.  If the
 	 * EOF falls mid-block, then indicate that the "virtual hole"
 	 * at the end of the file begins at the logical EOF, rather than
 	 * at the end of the last block.
 	 */
 	if (noff > file_sz) {
 		ASSERT(hole);
 		noff = file_sz;
 	}
 
 	if (noff < *off)
 		return (error);
 	*off = noff;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
     int *rvalp, caller_context_t *ct)
 {
 	offset_t off;
 	offset_t ndata;
 	dmu_object_info_t doi;
 	int error;
 	zfsvfs_t *zfsvfs;
 	znode_t *zp;
 
 	switch (com) {
 	case _FIOFFS:
 	{
 		return (0);
 
 		/*
 		 * The following two ioctls are used by bfu.  Faking out,
 		 * necessary to avoid bfu errors.
 		 */
 	}
 	case _FIOGDIO:
 	case _FIOSDIO:
 	{
 		return (0);
 	}
 
 	case _FIO_SEEK_DATA:
 	case _FIO_SEEK_HOLE:
 	{
 #ifdef illumos
 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		off = *(offset_t *)data;
 #endif
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/* offset parameter is in/out */
 		error = zfs_holey(vp, com, &off);
 		ZFS_EXIT(zfsvfs);
 		if (error)
 			return (error);
 #ifdef illumos
 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 			return (SET_ERROR(EFAULT));
 #else
 		*(offset_t *)data = off;
 #endif
 		return (0);
 	}
 #ifdef illumos
 	case _FIO_COUNT_FILLED:
 	{
 		/*
 		 * _FIO_COUNT_FILLED adds a new ioctl command which
 		 * exposes the number of filled blocks in a
 		 * ZFS object.
 		 */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 
 		/*
 		 * Wait for all dirty blocks for this object
 		 * to get synced out to disk, and the DMU info
 		 * updated.
 		 */
 		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Retrieve fill count from DMU object.
 		 */
 		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
 		if (error) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		ndata = doi.doi_fill_count;
 
 		ZFS_EXIT(zfsvfs);
 		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
 			return (SET_ERROR(EFAULT));
 		return (0);
 	}
 #endif
 	}
 	return (SET_ERROR(ENOTTY));
 }
 
 static vm_page_t
 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t end;
 
 	/*
 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 	 * aligned boundaries, if the range is not aligned.  As a result a
 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
 	 * the whole page would be considred clean despite have some dirty data.
 	 * For this reason we should shrink the range to DEV_BSIZE aligned
 	 * boundaries before calling vm_page_clear_dirty.
 	 */
 	end = rounddown2(off + nbytes, DEV_BSIZE);
 	off = roundup2(off, DEV_BSIZE);
 	nbytes = end - off;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
 	    VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 	if (pp != NULL) {
 		ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 		vm_object_pip_add(obj, 1);
 		pmap_remove_write(pp);
 		if (nbytes != 0)
 			vm_page_clear_dirty(pp, off, nbytes);
 	}
 	return (pp);
 }
 
 static void
 page_unbusy(vm_page_t pp)
 {
 
 	vm_page_sunbusy(pp);
 	vm_object_pip_wakeup(pp->object);
 }
 
 static vm_page_t
 page_wire(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT |
 	    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY);
 	return (m);
 }
 
 static void
 page_unwire(vm_page_t pp)
 {
 
 	vm_page_unwire(pp, PQ_ACTIVE);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Write:	If we find a memory mapped page, we write to *both*
  *		the page and the dmu buffer.
  */
 static void
 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
     int segflg, dmu_tx_t *tx)
 {
 	vm_object_t obj;
 	struct sf_buf *sf;
 	caddr_t va;
 	int off;
 
 	ASSERT(segflg != UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	vm_object_pip_add(obj, 1);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		int nbytes = imin(PAGESIZE - off, len);
 
 		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
 			zfs_vmobject_wunlock(obj);
 
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, oid, start+off, nbytes,
 			    va+off, DMU_READ_PREFETCH);;
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 	}
 	vm_object_pip_wakeup(obj);
 	zfs_vmobject_wunlock(obj);
 }
 
 /*
  * Read with UIO_NOCOPY flag means that sendfile(2) requests
  * ZFS to populate a range of page cache pages with data.
  *
  * NOTE: this function could be optimized to pre-allocate
  * all pages in advance, drain exclusive busy on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  */
 static int
 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	struct sf_buf *sf;
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int off;
 	int error = 0;
 
 	ASSERT(uio->uio_segflg == UIO_NOCOPY);
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
 
 	zfs_vmobject_wlock(obj);
 	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
 		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 		if (vm_page_none_valid(pp)) {
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
 			if (bytes != PAGESIZE && error == 0)
 				bzero(va + bytes, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			if (error == 0) {
 				vm_page_valid(pp);
 				vm_page_lock(pp);
 				vm_page_activate(pp);
 				vm_page_unlock(pp);
 			}
 			vm_page_sunbusy(pp);
 			if (error != 0 && !vm_page_wired(pp) &&
 			    pp->valid == 0 && vm_page_tryxbusy(pp))
 				vm_page_free(pp);
 		} else {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_sunbusy(pp);
 		}
 		if (error)
 			break;
 		uio->uio_resid -= bytes;
 		uio->uio_offset += bytes;
 		len -= bytes;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  What this means:
  *
  * On Read:	We "read" preferentially from memory mapped pages,
  *		else we default from the dmu buffer.
  *
  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  *	 the file is memory mapped.
  */
 static int
 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 {
 	znode_t *zp = VTOZ(vp);
 	vm_object_t obj;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int off;
 	int error = 0;
 
 	ASSERT(vp->v_mount != NULL);
 	obj = vp->v_object;
 	ASSERT(obj != NULL);
 
 	start = uio->uio_loffset;
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock(obj);
 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 		vm_page_t pp;
 		uint64_t bytes = MIN(PAGESIZE - off, len);
 
 		if (pp = page_wire(vp, start)) {
 			struct sf_buf *sf;
 			caddr_t va;
 
 			zfs_vmobject_wunlock(obj);
 			va = zfs_map_page(pp, &sf);
 #ifdef illumos
 			error = uiomove(va + off, bytes, UIO_READ, uio);
 #else
 			error = vn_io_fault_uiomove(va + off, bytes, uio);
 #endif
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock(obj);
 			page_unwire(pp);
 		} else {
 			zfs_vmobject_wunlock(obj);
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 			zfs_vmobject_wlock(obj);
 		}
 		len -= bytes;
 		off = 0;
 		if (error)
 			break;
 	}
 	zfs_vmobject_wunlock(obj);
 	return (error);
 }
 
 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 
 /*
  * Read bytes from specified file into supplied buffer.
  *
  *	IN:	vp	- vnode of file to be read from.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Side Effects:
  *	vp - atime updated if byte count > 0
  */
 /* ARGSUSED */
 static int
 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	ssize_t		n, nbytes;
 	int		error = 0;
 	xuio_t		*xuio = NULL;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Validate file offset
 	 */
 	if (uio->uio_loffset < (offset_t)0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Fasttrack empty reads
 	 */
 	if (uio->uio_resid == 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	/*
 	 * Check for mandatory locks
 	 */
 	if (MANDMODE(zp->z_mode)) {
 		if (error = chklock(vp, FREAD,
 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * If we're in FRSYNC mode, sync out this znode before reading it.
 	 */
 	if (zfsvfs->z_log &&
 	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	/*
 	 * Lock the range against changes.
 	 */
 	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
 	    uio->uio_loffset, uio->uio_resid, RL_READER);
 
 	/*
 	 * If we are reading past end-of-file we can skip
 	 * to the end; but we might still need to set atime.
 	 */
 	if (uio->uio_loffset >= zp->z_size) {
 		error = 0;
 		goto out;
 	}
 
 	ASSERT(uio->uio_loffset < zp->z_size);
 	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 
 #ifdef illumos
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 		int nblk;
 		int blksz = zp->z_blksz;
 		uint64_t offset = uio->uio_loffset;
 
 		xuio = (xuio_t *)uio;
 		if ((ISP2(blksz))) {
 			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 			    blksz)) / blksz;
 		} else {
 			ASSERT(offset + n <= blksz);
 			nblk = 1;
 		}
 		(void) dmu_xuio_init(xuio, nblk);
 
 		if (vn_has_cached_data(vp)) {
 			/*
 			 * For simplicity, we always allocate a full buffer
 			 * even if we only expect to read a portion of a block.
 			 */
 			while (--nblk >= 0) {
 				(void) dmu_xuio_add(xuio,
 				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    blksz), 0, blksz);
 			}
 		}
 	}
 #endif	/* illumos */
 
 	while (n > 0) {
 		nbytes = MIN(n, zfs_read_chunk_size -
 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 
 #ifdef __FreeBSD__
 		if (uio->uio_segflg == UIO_NOCOPY)
 			error = mappedread_sf(vp, nbytes, uio);
 		else
 #endif /* __FreeBSD__ */
 		if (vn_has_cached_data(vp)) {
 			error = mappedread(vp, nbytes, uio);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes);
 		}
 		if (error) {
 			/* convert checksum errors into IO errors */
 			if (error == ECKSUM)
 				error = SET_ERROR(EIO);
 			break;
 		}
 
 		n -= nbytes;
 	}
 out:
 	rangelock_exit(lr);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	vp	- vnode of file to be written to.
  *		uio	- structure supplying write location, range info,
  *			  and data buffer.
  *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
  *			  set if in append mode.
  *		cr	- credentials of caller.
  *		ct	- caller context (NFS/CIFS fem monitor only)
  *
  *	OUT:	uio	- updated offset and range.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime|mtime updated if byte count > 0
  */
 
 /* ARGSUSED */
 static int
 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	rlim64_t	limit = MAXOFFSET_T;
 	ssize_t		start_resid = uio->uio_resid;
 	ssize_t		tx_bytes;
 	uint64_t	end_size;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	offset_t	woff;
 	ssize_t		n, nbytes;
 	int		max_blksz = zfsvfs->z_max_blksz;
 	int		error = 0;
 	arc_buf_t	*abuf;
 	iovec_t		*aiov = NULL;
 	xuio_t		*xuio = NULL;
 	int		i_iov = 0;
 	int		iovcnt = uio->uio_iovcnt;
 	iovec_t		*iovp = uio->uio_iov;
 	int		write_eof;
 	int		count = 0;
 	sa_bulk_attr_t	bulk[4];
 	uint64_t	mtime[2], ctime[2];
 
 	/*
 	 * Fasttrack empty write
 	 */
 	n = start_resid;
 	if (n == 0)
 		return (0);
 
 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 		limit = MAXOFFSET_T;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &zp->z_size, 8);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/*
 	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 	 * callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * If immutable or not appending then return EPERM.
 	 * Intentionally allow ZFS_READONLY through here.
 	 * See zfs_zaccess_common()
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 	    (uio->uio_loffset < zp->z_size))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Validate file offset
 	 */
 	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 	if (woff < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Check for mandatory locks before calling rangelock_enter()
 	 * in order to prevent a deadlock with locks set via fcntl().
 	 */
 	if (MANDMODE((mode_t)zp->z_mode) &&
 	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef illumos
 	/*
 	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 * Skip this if uio contains loaned arc_buf.
 	 */
 	if ((uio->uio_extflg == UIO_XUIO) &&
 	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 		xuio = (xuio_t *)uio;
 	else
 		uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
 	 */
 	locked_range_t *lr;
 	if (ioflag & FAPPEND) {
 		/*
 		 * Obtain an appending range lock to guarantee file append
 		 * semantics.  We reset the write offset once we have the lock.
 		 */
 		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
 		woff = lr->lr_offset;
 		if (lr->lr_length == UINT64_MAX) {
 			/*
 			 * We overlocked the file because this write will cause
 			 * the file block size to increase.
 			 * Note that zp_size cannot change with this lock held.
 			 */
 			woff = zp->z_size;
 		}
 		uio->uio_loffset = woff;
 	} else {
 		/*
 		 * Note that if the file block size will change as a result of
 		 * this write, then this range lock will lock the entire file
 		 * so that we can re-write the block safely.
 		 */
 		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
 	}
 
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
 		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (EFBIG);
 	}
 
 	if (woff >= limit) {
 		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EFBIG));
 	}
 
 	if ((woff + n) > limit || woff > (limit - n))
 		n = limit - woff;
 
 	/* Will this write extend the file length? */
 	write_eof = (woff + n > zp->z_size);
 
 	end_size = MAX(zp->z_size, woff + n);
 
 	/*
 	 * Write the file in reasonable size chunks.  Each chunk is written
 	 * in a separate transaction; this keeps the intent log records small
 	 * and allows us to do more fine-grained space accounting.
 	 */
 	while (n > 0) {
 		abuf = NULL;
 		woff = uio->uio_loffset;
 		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			error = SET_ERROR(EDQUOT);
 			break;
 		}
 
 		if (xuio && abuf == NULL) {
 			ASSERT(i_iov < iovcnt);
 			aiov = &iovp[i_iov];
 			abuf = dmu_xuio_arcbuf(xuio, i_iov);
 			dmu_xuio_clear(xuio, i_iov);
 			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 			    iovec_t *, aiov, arc_buf_t *, abuf);
 			ASSERT((aiov->iov_base == abuf->b_data) ||
 			    ((char *)aiov->iov_base - (char *)abuf->b_data +
 			    aiov->iov_len == arc_buf_size(abuf)));
 			i_iov++;
 		} else if (abuf == NULL && n >= max_blksz &&
 		    woff >= zp->z_size &&
 		    P2PHASE(woff, max_blksz) == 0 &&
 		    zp->z_blksz == max_blksz) {
 			/*
 			 * This write covers a full block.  "Borrow" a buffer
 			 * from the dmu so that we can fill it before we enter
 			 * a transaction.  This avoids the possibility of
 			 * holding up the transaction if the data copy hangs
 			 * up on a pagefault (e.g., from an NFS server mapping).
 			 */
 			size_t cbytes;
 
 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 			    max_blksz);
 			ASSERT(abuf != NULL);
 			ASSERT(arc_buf_size(abuf) == max_blksz);
 			if (error = uiocopy(abuf->b_data, max_blksz,
 			    UIO_WRITE, uio, &cbytes)) {
 				dmu_return_arcbuf(abuf);
 				break;
 			}
 			ASSERT(cbytes == max_blksz);
 		}
 
 		/*
 		 * Start a transaction.
 		 */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			if (abuf != NULL)
 				dmu_return_arcbuf(abuf);
 			break;
 		}
 
 		/*
 		 * If rangelock_enter() over-locked we grow the blocksize
 		 * and then reduce the lock range.  This will only happen
 		 * on the first iteration since rangelock_reduce() will
 		 * shrink down lr_length to the appropriate size.
 		 */
 		if (lr->lr_length == UINT64_MAX) {
 			uint64_t new_blksz;
 
 			if (zp->z_blksz > max_blksz) {
 				/*
 				 * File's blocksize is already larger than the
 				 * "recordsize" property.  Only let it grow to
 				 * the next power of 2.
 				 */
 				ASSERT(!ISP2(zp->z_blksz));
 				new_blksz = MIN(end_size,
 				    1 << highbit64(zp->z_blksz));
 			} else {
 				new_blksz = MIN(end_size, max_blksz);
 			}
 			zfs_grow_blocksize(zp, new_blksz, tx);
 			rangelock_reduce(lr, woff, n);
 		}
 
 		/*
 		 * XXX - should we really limit each write to z_max_blksz?
 		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 		 */
 		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 
 		if (woff + nbytes > zp->z_size)
 			vnode_pager_setsize(vp, woff + nbytes);
 
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
 			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 			/*
 			 * If this is not a full block write, but we are
 			 * extending the file past EOF and this data starts
 			 * block-aligned, use assign_arcbuf().  Otherwise,
 			 * write via dmu_write().
 			 */
 			if (tx_bytes < max_blksz && (!write_eof ||
 			    aiov->iov_base != abuf->b_data)) {
 				ASSERT(xuio);
 				dmu_write(zfsvfs->z_os, zp->z_id, woff,
 				    aiov->iov_len, aiov->iov_base, tx);
 				dmu_return_arcbuf(abuf);
 				xuio_stat_wbuf_copied();
 			} else {
 				ASSERT(xuio || tx_bytes == max_blksz);
 				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 				    woff, abuf, tx);
 			}
 			ASSERT(tx_bytes <= uio->uio_resid);
 			uioskip(uio, tx_bytes);
 		}
 		if (tx_bytes && vn_has_cached_data(vp)) {
 			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
 			    zp->z_id, uio->uio_segflg, tx);
 		}
 
 		/*
 		 * If we made no progress, we're done.  If we made even
 		 * partial progress, update the znode and ZIL accordingly.
 		 */
 		if (tx_bytes == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
 			dmu_tx_commit(tx);
 			ASSERT(error != 0);
 			break;
 		}
 
 		/*
 		 * Clear Set-UID/Set-GID bits on successful write if not
 		 * privileged and at least one of the excute bits is set.
 		 *
 		 * It would be nice to to this after all writes have
 		 * been done, but that would still expose the ISUID/ISGID
 		 * to another app after the partial write is committed.
 		 *
 		 * Note: we don't call zfs_fuid_map_id() here because
 		 * user 0 is not an ephemeral uid.
 		 */
 		mutex_enter(&zp->z_acl_lock);
 		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 		    (S_IXUSR >> 6))) != 0 &&
 		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 		    secpolicy_vnode_setid_retain(vp, cr,
 		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 			uint64_t newmode;
 			zp->z_mode &= ~(S_ISUID | S_ISGID);
 			newmode = zp->z_mode;
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 			    (void *)&newmode, sizeof (uint64_t), tx);
 		}
 		mutex_exit(&zp->z_acl_lock);
 
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 
 		/*
 		 * Update the file size (zp_size) if it has changed;
 		 * account for possible concurrent updates.
 		 */
 		while ((end_size = zp->z_size) < uio->uio_loffset) {
 			(void) atomic_cas_64(&zp->z_size, end_size,
 			    uio->uio_loffset);
 #ifdef illumos
 			ASSERT(error == 0);
 #else
 			ASSERT(error == 0 || error == EFAULT);
 #endif
 		}
 		/*
 		 * If we are replaying and eof is non zero then force
 		 * the file size to the specified eof. Note, there's no
 		 * concurrency during replay.
 		 */
 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 			zp->z_size = zfsvfs->z_replay_eof;
 
 		if (error == 0)
 			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		else
 			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
 			break;
 		ASSERT(tx_bytes == nbytes);
 		n -= nbytes;
 
 #ifdef illumos
 		if (!xuio && n > 0)
 			uio_prefaultpages(MIN(n, max_blksz), uio);
 #endif
 	}
 
 	rangelock_exit(lr);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
 	 * Otherwise, it's at least a partial write, so it's successful.
 	 */
 	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 #ifdef __FreeBSD__
 	/*
 	 * EFAULT means that at least one page of the source buffer was not
 	 * available.  VFS will re-try remaining I/O upon this error.
 	 */
 	if (error == EFAULT) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 #endif
 
 	if (ioflag & (FSYNC | FDSYNC) ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, zp->z_id);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /* ARGSUSED */
 void
 zfs_get_done(zgd_t *zgd, int error)
 {
 	znode_t *zp = zgd->zgd_private;
 	objset_t *os = zp->z_zfsvfs->z_os;
 
 	if (zgd->zgd_db)
 		dmu_buf_rele(zgd->zgd_db, zgd);
 
 	rangelock_exit(zgd->zgd_lr);
 
 	/*
 	 * Release the vnode asynchronously as we currently have the
 	 * txg stopped from syncing.
 	 */
 	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 
 	kmem_free(zgd, sizeof (zgd_t));
 }
 
 #ifdef DEBUG
 static int zil_fault_io = 0;
 #endif
 
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
 int
 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t object = lr->lr_foid;
 	uint64_t offset = lr->lr_offset;
 	uint64_t size = lr->lr_length;
 	dmu_buf_t *db;
 	zgd_t *zgd;
 	int error = 0;
 
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(zio, !=, NULL);
 	ASSERT3U(size, !=, 0);
 
 	/*
 	 * Nothing to do if the file has been removed
 	 */
 	if (zfs_zget(zfsvfs, object, &zp) != 0)
 		return (SET_ERROR(ENOENT));
 	if (zp->z_unlinked) {
 		/*
 		 * Release the vnode asynchronously as we currently have the
 		 * txg stopped from syncing.
 		 */
 		VN_RELE_ASYNC(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
 		return (SET_ERROR(ENOENT));
 	}
 
 	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 	zgd->zgd_lwb = lwb;
 	zgd->zgd_private = zp;
 
 	/*
 	 * Write records come in two flavors: immediate and indirect.
 	 * For small writes it's cheaper to store the data with the
 	 * log record (immediate); for large writes it's cheaper to
 	 * sync the data and get a pointer to it (indirect) so that
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
 		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
 		    offset, size, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (offset >= zp->z_size) {
 			error = SET_ERROR(ENOENT);
 		} else {
 			error = dmu_read(os, object, offset, size, buf,
 			    DMU_READ_NO_PREFETCH);
 		}
 		ASSERT(error == 0 || error == ENOENT);
 	} else { /* indirect write */
 		/*
 		 * Have to lock the whole block to ensure when it's
 		 * written out and its checksum is being calculated
 		 * that no one can change the data. We need to re-check
 		 * blocksize after we get the lock in case it's changed!
 		 */
 		for (;;) {
 			uint64_t blkoff;
 			size = zp->z_blksz;
 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
 			offset -= blkoff;
 			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
 			    offset, size, RL_READER);
 			if (zp->z_blksz == size)
 				break;
 			offset += blkoff;
 			rangelock_exit(zgd->zgd_lr);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (lr->lr_offset >= zp->z_size)
 			error = SET_ERROR(ENOENT);
 #ifdef DEBUG
 		if (zil_fault_io) {
 			error = SET_ERROR(EIO);
 			zil_fault_io = 0;
 		}
 #endif
 		if (error == 0)
 			error = dmu_buf_hold(os, object, offset, zgd, &db,
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
 			blkptr_t *bp = &lr->lr_blkptr;
 
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
 			ASSERT(db->db_offset == offset);
 			ASSERT(db->db_size == size);
 
 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
 			    zfs_get_done, zgd);
 			ASSERT(error || lr->lr_length <= size);
 
 			/*
 			 * On success, we need to wait for the write I/O
 			 * initiated by dmu_sync() to complete before we can
 			 * release this dbuf.  We will finish everything up
 			 * in the zfs_get_done() callback.
 			 */
 			if (error == 0)
 				return (0);
 
 			if (error == EALREADY) {
 				lr->lr_common.lrc_txtype = TX_WRITE2;
 				/*
 				 * TX_WRITE2 relies on the data previously
 				 * written by the TX_WRITE that caused
 				 * EALREADY.  We zero out the BP because
 				 * it is the old, currently-on-disk BP.
 				 */
 				zgd->zgd_bp = NULL;
 				BP_ZERO(bp);
 				error = 0;
 			}
 		}
 	}
 
 	zfs_get_done(zgd, error);
 
 	return (error);
 }
 
 /*ARGSUSED*/
 static int
 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (flag & V_ACE_MASK)
 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
 	else
 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 static int
 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
 	int error;
 
 	*vpp = arg;
 	error = vn_lock(*vpp, lkflags);
 	if (error != 0)
 		vrele(*vpp);
 	return (error);
 }
 
 static int
 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int error;
 	int ltype;
 
 	ASSERT_VOP_LOCKED(dvp, __func__);
 #ifdef DIAGNOSTIC
 	if ((zdp->z_pflags & ZFS_XATTR) == 0)
 		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
 #endif
 
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 		ASSERT3P(dvp, ==, vp);
 		vref(dvp);
 		ltype = lkflags & LK_TYPE_MASK;
 		if (ltype != VOP_ISLOCKED(dvp)) {
 			if (ltype == LK_EXCLUSIVE)
 				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 			else /* if (ltype == LK_SHARED) */
 				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 
 			/*
 			 * Relock for the "." case could leave us with
 			 * reclaimed vnode.
 			 */
 			if (VN_IS_DOOMED(dvp)) {
 				vrele(dvp);
 				return (SET_ERROR(ENOENT));
 			}
 		}
 		return (0);
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 		/*
 		 * Note that in this case, dvp is the child vnode, and we
 		 * are looking up the parent vnode - exactly reverse from
 		 * normal operation.  Unlocking dvp requires some rather
 		 * tricky unlock/relock dance to prevent mp from being freed;
 		 * use vn_vget_ino_gen() which takes care of all that.
 		 *
 		 * XXX Note that there is a time window when both vnodes are
 		 * unlocked.  It is possible, although highly unlikely, that
 		 * during that window the parent-child relationship between
 		 * the vnodes may change, for example, get reversed.
 		 * In that case we would have a wrong lock order for the vnodes.
 		 * All other filesystems seem to ignore this problem, so we
 		 * do the same here.
 		 * A potential solution could be implemented as follows:
 		 * - using LK_NOWAIT when locking the second vnode and retrying
 		 *   if necessary
 		 * - checking that the parent-child relationship still holds
 		 *   after locking both vnodes and retrying if it doesn't
 		 */
 		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
 		return (error);
 	} else {
 		error = vn_lock(vp, lkflags);
 		if (error != 0)
 			vrele(vp);
 		return (error);
 	}
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		pnp	- full pathname to lookup [UNUSED].
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *		rdir	- root directory vnode [UNUSED].
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	NA
  */
 /* ARGSUSED */
 static int
 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td, int flags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	znode_t *zp;
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 	int	error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & LOOKUP_XATTR)) {
 		if (dvp->v_type != VDIR) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
 
 	*vpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 #ifdef TODO
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 #endif
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
 		    B_FALSE, cr)) {
 			vrele(*vpp);
 			*vpp = NULL;
 		}
 
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 
 	/*
 	 * First handle the special cases.
 	 */
 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
 		/*
 		 * If we are a snapshot mounted under .zfs, return
 		 * the vp for the snapshot directory.
 		 */
 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 			struct componentname cn;
 			vnode_t *zfsctl_vp;
 			int ltype;
 
 			ZFS_EXIT(zfsvfs);
 			ltype = VOP_ISLOCKED(dvp);
 			VOP_UNLOCK(dvp, 0);
 			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 			    &zfsctl_vp);
 			if (error == 0) {
 				cn.cn_nameptr = "snapshot";
 				cn.cn_namelen = strlen(cn.cn_nameptr);
 				cn.cn_nameiop = cnp->cn_nameiop;
 				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 				cn.cn_lkflags = cnp->cn_lkflags;
 				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 				vput(zfsctl_vp);
 			}
 			vn_lock(dvp, ltype | LK_RETRY);
 			return (error);
 		}
 	}
 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 		ZFS_EXIT(zfsvfs);
 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 			return (SET_ERROR(ENOTSUP));
 		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 		return (error);
 	}
 
 	/*
 	 * The loop is retry the lookup if the parent-child relationship
 	 * changes during the dot-dot locking complexities.
 	 */
 	for (;;) {
 		uint64_t parent;
 
 		error = zfs_dirlook(zdp, nm, &zp);
 		if (error == 0)
 			*vpp = ZTOV(zp);
 
 		ZFS_EXIT(zfsvfs);
 		if (error != 0)
 			break;
 
 		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 		if (error != 0) {
 			/*
 			 * If we've got a locking error, then the vnode
 			 * got reclaimed because of a force unmount.
 			 * We never enter doomed vnodes into the name cache.
 			 */
 			*vpp = NULL;
 			return (error);
 		}
 
 		if ((cnp->cn_flags & ISDOTDOT) == 0)
 			break;
 
 		ZFS_ENTER(zfsvfs);
 		if (zdp->z_sa_hdl == NULL) {
 			error = SET_ERROR(EIO);
 		} else {
 			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 			    &parent, sizeof (parent));
 		}
 		if (error != 0) {
 			ZFS_EXIT(zfsvfs);
 			vput(ZTOV(zp));
 			break;
 		}
 		if (zp->z_id == parent) {
 			ZFS_EXIT(zfsvfs);
 			break;
 		}
 		vput(ZTOV(zp));
 	}
 
 out:
 	if (error != 0)
 		*vpp = NULL;
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 				cnp->cn_flags |= SAVENAME;
 				break;
 			}
 			/* FALLTHROUGH */
 		case DELETE:
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 			break;
 		}
 	}
 
 	/* Insert name into cache (as non-existent) if appropriate. */
 	if (zfsvfs->z_use_namecache &&
 	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, NULL, cnp);
 
 	/* Insert name into cache if appropriate. */
 	if (zfsvfs->z_use_namecache &&
 	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the vp of the created or trunc'd file.
  *
  *	IN:	dvp	- vnode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		flag	- large file flag [UNUSED].
  *		ct	- caller context
  *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created or trunc'd entry.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 
 /* ARGSUSED */
 static int
 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
     vnode_t **vpp, cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	void		*vsecp = NULL;
 	int		flag = 0;
 	uint64_t	txtype;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	*vpp = NULL;
 
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		goto out;
 	}
 
 	/*
 	 * We only support the creation of regular files in
 	 * extended attribute directories.
 	 */
 
 	if ((dzp->z_pflags & ZFS_XATTR) &&
 	    (vap->va_type != VREG)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids)) != 0)
 		goto out;
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	getnewvnode_reserve(1);
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 	    vsecp, acl_ids.z_fuidp, vap);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 out:
 	if (error == 0) {
 		*vpp = ZTOV(zp);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 
 /*ARGSUSED*/
 static int
 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp = VTOZ(vp);
 	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t	obj = 0;
 	dmu_tx_t	*tx;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 	zp = VTOZ(vp);
 
 	xattr_obj = 0;
 	xzp = NULL;
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	vnevent_remove(vp, dvp, name, ct);
 
 	obj = zp->z_id;
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 
 	if (xzp) {
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		zfs_unlinked_add(zp, tx);
 		vp->v_vflag |= VV_NOSYNC;
 	}
 
 	txtype = TX_REMOVE;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
 
 	dmu_tx_commit(tx);
 out:
 
 	if (xzp)
 		vrele(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dvp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dvp	- vnode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *		vsecp	- ACL to be set
  *
  *	OUT:	vpp	- vnode of created directory.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  *	 vp - ctime|mtime|atime updated
  */
 /*ARGSUSED*/
 static int
 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	ksid_t		*ksid;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 
 	ASSERT(vap->va_type == VDIR);
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	ksid = crgetsid(cr, KSID_OWNER);
 	if (ksid)
 		uid = ksid_getid(ksid);
 	else
 		uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    ((vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (dzp->z_pflags & ZFS_XATTR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 	*vpp = NULL;
 
 	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
 
 	*vpp = ZTOV(zp);
 
 	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Remove a directory subdir entry.  If the current working
  * directory is the same as the subdir to be removed, the
  * remove will fail.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		name	- name of directory to be removed.
  *		cwd	- vnode of current working directory.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	ZFS_VERIFY_ZP(zp);
 	zilog = zfsvfs->z_log;
 
 
 	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
 		goto out;
 	}
 
 	if (vp->v_type != VDIR) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	cache_purge(dvp);
 
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
 
 	if (error == 0) {
 		uint64_t txtype = TX_RMDIR;
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
 	}
 
 	dmu_tx_commit(tx);
 
 	cache_purge(vp);
 out:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 /* ARGSUSED */
 static int
 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	edirent_t	*eodp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	boolean_t	check_sysattrs;
 	uint8_t		type;
 	int		ncooks;
 	u_long		*cooks = NULL;
 	int		flags = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (uio->uio_iov->iov_len <= 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = uio->uio_loffset;
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 */
 	iovp = uio->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 	eodp = (struct edirent *)odp;
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 */
 		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 	/*
 	 * If this VFS supports the system attribute view interface; and
 	 * we're looking at an extended attribute directory; and we care
 	 * about normalization conflicts on this vfs; then we must check
 	 * for normalization conflicts with the sysattr name space.
 	 */
 #ifdef TODO
 	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
 	    (flags & V_RDDIR_ENTFLAGS);
 #else
 	check_sysattrs = 0;
 #endif
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 */
 			if (error = zap_cursor_retrieve(&zc, &zap)) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 
 			if (check_sysattrs && !zap.za_normalization_conflict) {
 #ifdef TODO
 				zap.za_normalization_conflict =
 				    xattr_sysattr_casechk(zap.za_name);
 #else
 				panic("%s:%u: TODO", __func__, __LINE__);
 #endif
 			}
 		}
 
 		if (flags & V_RDDIR_ACCFILTER) {
 			/*
 			 * If we have no access at all, don't include
 			 * this entry in the returned information
 			 */
 			znode_t	*ezp;
 			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
 				goto skip_entry;
 			if (!zfs_has_access(ezp, cr)) {
 				vrele(ZTOV(ezp));
 				goto skip_entry;
 			}
 			vrele(ZTOV(ezp));
 		}
 
 		if (flags & V_RDDIR_ENTFLAGS)
 			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
 		else
 			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = SET_ERROR(EINVAL);
 				goto update;
 			}
 			break;
 		}
 		if (flags & V_RDDIR_ENTFLAGS) {
 			/*
 			 * Add extended flag entry:
 			 */
 			eodp->ed_ino = objnum;
 			eodp->ed_reclen = reclen;
 			/* NOTE: ed_off is the offset for the *next* entry. */
 			next = &eodp->ed_off;
 			eodp->ed_eflags = zap.za_normalization_conflict ?
 			    ED_CASE_CONFLICT : 0;
 			(void) strncpy(eodp->ed_name, zap.za_name,
 			    EDIRENT_NAMELEN(reclen));
 			eodp = (edirent_t *)((intptr_t)eodp + reclen);
 		} else {
 			/*
 			 * Add normal entry:
 			 */
 			odp->d_ino = objnum;
 			odp->d_reclen = reclen;
 			odp->d_namlen = strlen(zap.za_name);
 			/* NOTE: d_off is the offset for the *next* entry. */
 			next = &odp->d_off;
 			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 			odp->d_type = type;
 			dirent_terminate(odp);
 			odp = (dirent64_t *)((intptr_t)odp + reclen);
 		}
 		outcount += reclen;
 
 		ASSERT(outcount <= bufsize);
 
 		/* Prefetch znode */
 		if (prefetch)
 			dmu_prefetch(os, objnum, 0, 0, 0,
 			    ZIO_PRIORITY_SYNC_READ);
 
 	skip_entry:
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		/* Fill the offset right after advancing the cursor. */
 		if (next != NULL)
 			*next = offset;
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		uio->uio_resid -= outcount;
 	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = uio->uio_loffset;
 	}
 
 update:
 	zap_cursor_fini(&zc);
 	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
 		kmem_free(outbuf, bufsize);
 
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	uio->uio_loffset = offset;
 	ZFS_EXIT(zfsvfs);
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 ulong_t zfs_fsync_sync_cnt = 4;
 
 static int
 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
 
 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		zil_commit(zfsvfs->z_log, zp->z_id);
 		ZFS_EXIT(zfsvfs);
 	}
 	return (0);
 }
 
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If AT_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 (always succeeds).
  */
 /* ARGSUSED */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
 		    &rdev, 8);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr)) {
 			ZFS_EXIT(zfsvfs);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
 #ifdef illumos
 	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
 #else
 	vn_fsid(vp, vap);
 #endif
 	vap->va_nodeid = zp->z_id;
 	vap->va_nlink = zp->z_links;
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
 	    zp->z_links < ZFS_LINK_MAX)
 		vap->va_nlink++;
 	vap->va_size = zp->z_size;
 #ifdef illumos
 	vap->va_rdev = vp->v_rdev;
 #else
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 #endif
 	vap->va_seq = zp->z_seq;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
      	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    vp->v_type == VREG) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = zp->z_gen;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 	}
 
 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	vp	- vnode of file to be modified.
  *		vap	- new attribute values.
  *			  If AT_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 /* ARGSUSED */
 static int
 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2];
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
 	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (SET_ERROR(EINVAL));
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	xva_init(&tmpxvattr);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
 	 */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (AT_ATIME | AT_MTIME)) {
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EOVERFLOW));
 		}
 	}
 	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
 	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EOVERFLOW));
 	}
 
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr);
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			ZFS_EXIT(zfsvfs);
 			return (SET_ERROR(EPERM));
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				ZFS_EXIT(zfsvfs);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Save the mode, as secpolicy_vnode_setattr()
 				 * will overwrite it with ova.va_mode.
 				 */
 				saved_mode = vap->va_mode;
 			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err) {
 			ZFS_EXIT(zfsvfs);
 			return (err);
 		}
 
 		if (trim_mask) {
 			vap->va_mask |= saved_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Recover the mode after
 				 * secpolicy_vnode_setattr().
 				 */
 				vap->va_mode = saved_mode;
 			}
 		}
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (AT_UID | AT_GID))) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
 			if (err == 0) {
 				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
 				if (err != 0)
 					vrele(ZTOV(attrzp));
 			}
 			if (err)
 				goto out2;
 		}
 		if (mask & AT_UID) {
 			new_uid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_uid != zp->z_uid &&
 			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & AT_GID) {
 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &fuidp);
 			if (new_gid != zp->z_gid &&
 			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = SET_ERROR(EPERM);
 			goto out;
 		}
 
 		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
 			goto out;
 
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if ((mask & AT_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 
 		if (mask & AT_UID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			zp->z_uid = new_uid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				attrzp->z_uid = new_uid;
 			}
 		}
 
 		if (mask & AT_GID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			zp->z_gid = new_gid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				attrzp->z_gid = new_gid;
 			}
 		}
 		if (!(mask & AT_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & AT_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = new_mode;
 		ASSERT3U((uintptr_t)aclp, !=, 0);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 
 	if (mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
 	if (mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 		    NULL, mtime, sizeof (mtime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 	} else if (mask != 0) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
 		    B_TRUE);
 		if (attrzp) {
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_CTIME(zfsvfs), NULL,
 			    &ctime, sizeof (ctime));
 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
 			    mtime, ctime, B_TRUE);
 		}
 	}
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 			xoap->xoa_createtime = vap->va_birthtime;
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(vp->v_type == VREG);
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (attrzp)
 		vput(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 	}
 
 out2:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
 }
 
 /*
  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
  * fail to acquire any lock in the path we will drop all held locks,
  * acquire the new lock in a blocking fashion, and then release it and
  * restart the rename.  This acquire/release step ensures that we do not
  * spin on a lock waiting for release.  On error release all vnode locks
  * and decrement references the way tmpfs_rename() would do.
  */
 static int
 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
     struct vnode *tdvp, struct vnode **tvpp,
     const struct componentname *scnp, const struct componentname *tcnp)
 {
 	zfsvfs_t	*zfsvfs;
 	struct vnode	*nvp, *svp, *tvp;
 	znode_t		*sdzp, *tdzp, *szp, *tzp;
 	const char	*snm = scnp->cn_nameptr;
 	const char	*tnm = tcnp->cn_nameptr;
 	int error;
 
 	VOP_UNLOCK(tdvp, 0);
 	if (*tvpp != NULL && *tvpp != tdvp)
 		VOP_UNLOCK(*tvpp, 0);
 
 relock:
 	error = vn_lock(sdvp, LK_EXCLUSIVE);
 	if (error)
 		goto out;
 	sdzp = VTOZ(sdvp);
 
 	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK(sdvp, 0);
 		if (error != EBUSY)
 			goto out;
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto out;
 		VOP_UNLOCK(tdvp, 0);
 		goto relock;
 	}
 	tdzp = VTOZ(tdvp);
 
 	/*
 	 * Before using sdzp and tdzp we must ensure that they are live.
 	 * As a porting legacy from illumos we have two things to worry
 	 * about.  One is typical for FreeBSD and it is that the vnode is
 	 * not reclaimed (doomed).  The other is that the znode is live.
 	 * The current code can invalidate the znode without acquiring the
 	 * corresponding vnode lock if the object represented by the znode
 	 * and vnode is no longer valid after a rollback or receive operation.
 	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
 	 * that protects the znodes from the invalidation.
 	 */
 	zfsvfs = sdzp->z_zfsvfs;
 	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
 	 * bypassing the cleanup code in the case of an error.
 	 */
 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		error = SET_ERROR(EIO);
 		goto out;
 	}
 
 	/*
 	 * Re-resolve svp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
 	if (error != 0) {
 		/* Source entry invalid or not there. */
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
 		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
 			error = SET_ERROR(EINVAL);
 		goto out;
 	}
 	svp = ZTOV(szp);
 
 	/*
 	 * Re-resolve tvp, if it disappeared we just carry on.
 	 */
 	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		vrele(svp);
 		if ((tcnp->cn_flags & ISDOTDOT) != 0)
 			error = SET_ERROR(EINVAL);
 		goto out;
 	}
 	if (tzp != NULL)
 		tvp = ZTOV(tzp);
 	else
 		tvp = NULL;
 
 	/*
 	 * At present the vnode locks must be acquired before z_teardown_lock,
 	 * although it would be more logical to use the opposite order.
 	 */
 	ZFS_EXIT(zfsvfs);
 
 	/*
 	 * Now try acquire locks on svp and tvp.
 	 */
 	nvp = svp;
 	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK(sdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (tvp != NULL)
 			vrele(tvp);
 		if (error != EBUSY) {
 			vrele(nvp);
 			goto out;
 		}
 		error = vn_lock(nvp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vrele(nvp);
 			goto out;
 		}
 		VOP_UNLOCK(nvp, 0);
 		/*
 		 * Concurrent rename race.
 		 * XXX ?
 		 */
 		if (nvp == tdvp) {
 			vrele(nvp);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		vrele(*svpp);
 		*svpp = nvp;
 		goto relock;
 	}
 	vrele(*svpp);
 	*svpp = nvp;
 
 	if (*tvpp != NULL)
 		vrele(*tvpp);
 	*tvpp = NULL;
 	if (tvp != NULL) {
 		nvp = tvp;
 		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 		if (error != 0) {
 			VOP_UNLOCK(sdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			VOP_UNLOCK(*svpp, 0);
 			if (error != EBUSY) {
 				vrele(nvp);
 				goto out;
 			}
 			error = vn_lock(nvp, LK_EXCLUSIVE);
 			if (error != 0) {
 				vrele(nvp);
 				goto out;
 			}
 			vput(nvp);
 			goto relock;
 		}
 		*tvpp = nvp;
 	}
 
 	return (0);
 
 out:
 	return (error);
 }
 
 /*
  * Note that we must use VRELE_ASYNC in this function as it walks
  * up the directory tree and vrele may need to acquire an exclusive
  * lock if a last reference to a vnode is dropped.
  */
 static int
 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
 {
 	zfsvfs_t	*zfsvfs;
 	znode_t		*zp, *zp1;
 	uint64_t	parent;
 	int		error;
 
 	zfsvfs = tdzp->z_zfsvfs;
 	if (tdzp == szp)
 		return (SET_ERROR(EINVAL));
 	if (tdzp == sdzp)
 		return (0);
 	if (tdzp->z_id == zfsvfs->z_root)
 		return (0);
 	zp = tdzp;
 	for (;;) {
 		ASSERT(!zp->z_unlinked);
 		if ((error = sa_lookup(zp->z_sa_hdl,
 		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
 			break;
 
 		if (parent == szp->z_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		if (parent == zfsvfs->z_root)
 			break;
 		if (parent == sdzp->z_id)
 			break;
 
 		error = zfs_zget(zfsvfs, parent, &zp1);
 		if (error != 0)
 			break;
 
 		if (zp != tdzp)
 			VN_RELE_ASYNC(ZTOV(zp),
 			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
 		zp = zp1;
 	}
 
 	if (error == ENOTDIR)
 		panic("checkpath: .. not a directory\n");
 	if (zp != tdzp)
 		VN_RELE_ASYNC(ZTOV(zp),
 		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
 	return (error);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
     cred_t *cr)
 {
 	zfsvfs_t	*zfsvfs;
 	znode_t		*sdzp, *tdzp, *szp, *tzp;
 	zilog_t		*zilog = NULL;
 	dmu_tx_t	*tx;
 	char		*snm = scnp->cn_nameptr;
 	char		*tnm = tcnp->cn_nameptr;
 	int		error = 0;
 
 	/* Reject renames across filesystems. */
 	if ((*svpp)->v_mount != tdvp->v_mount ||
 	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	if (zfsctl_is_node(tdvp)) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	/*
 	 * Lock all four vnodes to ensure safety and semantics of renaming.
 	 */
 	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
 	if (error != 0) {
 		/* no vnodes are locked in the case of error here */
 		return (error);
 	}
 
 	tdzp = VTOZ(tdvp);
 	sdzp = VTOZ(sdvp);
 	zfsvfs = tdzp->z_zfsvfs;
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * After we re-enter ZFS_ENTER() we will have to revalidate all
 	 * znodes involved.
 	 */
 	ZFS_ENTER(zfsvfs);
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		error = SET_ERROR(EILSEQ);
 		goto unlockout;
 	}
 
 	/* If source and target are the same file, there is nothing to do. */
 	if ((*svpp) == (*tvpp)) {
 		error = 0;
 		goto unlockout;
 	}
 
 	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
 	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
 	    (*tvpp)->v_mountedhere != NULL)) {
 		error = SET_ERROR(EXDEV);
 		goto unlockout;
 	}
 
 	/*
 	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
 	 * bypassing the cleanup code in the case of an error.
 	 */
 	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
 		error = SET_ERROR(EIO);
 		goto unlockout;
 	}
 
 	szp = VTOZ(*svpp);
 	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
 	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
 		error = SET_ERROR(EIO);
 		goto unlockout;
 	}
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		error = SET_ERROR(EINVAL);
 		goto unlockout;
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
 		goto unlockout;
 
 	if ((*svpp)->v_type == VDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
 		    sdzp == szp ||
 		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
 			goto unlockout;
 		}
 
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if (error = zfs_rename_check(szp, sdzp, tdzp))
 			goto unlockout;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if ((*svpp)->v_type == VDIR) {
 			if ((*tvpp)->v_type != VDIR) {
 				error = SET_ERROR(ENOTDIR);
 				goto unlockout;
 			} else {
 				cache_purge(tdvp);
 				if (sdvp != tdvp)
 					cache_purge(sdvp);
 			}
 		} else {
 			if ((*tvpp)->v_type == VDIR) {
 				error = SET_ERROR(EISDIR);
 				goto unlockout;
 			}
 		}
 	}
 
 	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
 	if (tzp)
 		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
 	 * as source directory.
 	 */
 	if (tdvp != sdvp) {
 		vnevent_rename_dest_dir(tdvp, ct);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto unlockout;
 	}
 
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
 			    NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
 				    snm, tdzp, tnm, szp);
 
 				/*
 				 * Update path information for the target vnode
 				 */
 				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
 				    ZRENAMING, NULL), ==, 0);
 			}
 		}
 		if (error == 0) {
 			cache_purge(*svpp);
 			if (*tvpp != NULL)
 				cache_purge(*tvpp);
 			cache_purge_negative(tdvp);
 		}
 	}
 
 	dmu_tx_commit(tx);
 
 unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
 	ZFS_EXIT(zfsvfs);
 	VOP_UNLOCK(*svpp, 0);
 	VOP_UNLOCK(sdvp, 0);
 
 out:				/* original two vnodes are locked */
 	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (*tvpp != NULL)
 		VOP_UNLOCK(*tvpp, 0);
 	if (tdvp != *tvpp)
 		VOP_UNLOCK(tdvp, 0);
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dvp	- Directory to contain new symbolic link.
  *		link	- Name for new symlink entry.
  *		vap	- Attributes of new entry.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 /*ARGSUSED*/
 static int
 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
     cred_t *cr, kthread_t *td)
 {
 	znode_t		*zp, *dzp = VTOZ(dvp);
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 	int		flags = 0;
 
 	ASSERT(vap->va_type == VLNK);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (len > MAXPATHLEN) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
 		zfs_acl_ids_free(&acl_ids);
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	getnewvnode_reserve(1);
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datsets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
 
 	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	*vpp = ZTOV(zp);
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Return, in the buffer contained in the provided uio structure,
  * the symbolic path referred to by vp.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	OUT:	uio	- structure containing the link path.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 /* ARGSUSED */
 static int
 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	int		error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if (zp->z_is_sa)
 		error = sa_lookup_uio(zp->z_sa_hdl,
 		    SA_ZPL_SYMLINK(zfsvfs), uio);
 	else
 		error = zfs_sa_readlink(zp, uio);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Insert a new entry into directory tdvp referencing svp.
  *
  *	IN:	tdvp	- Directory to contain new entry.
  *		svp	- vnode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		ct	- caller context
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	tdvp - ctime|mtime updated
  *	 svp - ctime updated
  */
 /* ARGSUSED */
 static int
 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	znode_t		*dzp = VTOZ(tdvp);
 	znode_t		*tzp, *szp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	parent;
 	uid_t		owner;
 
 	ASSERT(tdvp->v_type == VDIR);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(dzp);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (svp->v_type == VDIR) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	szp = VTOZ(svp);
 	ZFS_VERIFY_ZP(szp);
 
 	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 
 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EPERM));
 	}
 
 	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
 	if (error) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	error = zfs_link_create(dzp, name, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
 	}
 
 	dmu_tx_commit(tx);
 
 	if (error == 0) {
 		vnevent_link(svp, ct);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 
 /*ARGSUSED*/
 void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL) {
 		/*
 		 * The fs has been unmounted, or we did a
 		 * suspend/resume and this file no longer exists.
 		 */
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	if (zp->z_unlinked) {
 		/*
 		 * Fast path to recycle a vnode of a removed file.
 		 */
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		vrecycle(vp);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 		} else {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
 			dmu_tx_commit(tx);
 		}
 	}
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 
 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
 
 /*ARGSUSED*/
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint32_t	gen;
 	uint64_t	gen64;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		size, i, error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &gen64, sizeof (uint64_t))) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	gen = (uint32_t)gen64;
 
 	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
 
 #ifdef illumos
 	if (fidp->fid_len < size) {
 		fidp->fid_len = size;
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(ENOSPC));
 	}
 #else
 	fidp->fid_len = size;
 #endif
 
 	zfid = (zfid_short_t *)fidp;
 
 	zfid->zf_len = size;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	if (gen == 0)
 		gen = 1;
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	if (size == LONG_FID_LEN) {
 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
 		zfid_long_t	*zlfid;
 
 		zlfid = (zfid_long_t *)fidp;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
 
 		/* XXX - this should be the generation number for the objset */
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			zlfid->zf_setgen[i] = 0;
 	}
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t		*zp, *xzp;
 	zfsvfs_t	*zfsvfs;
 	int		error;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
 		return (0);
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		return (0);
 #ifdef illumos
 	case _PC_XATTR_EXISTS:
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		ZFS_ENTER(zfsvfs);
 		ZFS_VERIFY_ZP(zp);
 		*valp = 0;
 		error = zfs_dirent_lookup(zp, "", &xzp,
 		    ZXATTR | ZEXISTS | ZSHARED);
 		if (error == 0) {
 			if (!zfs_dirempty(xzp))
 				*valp = 1;
 			vrele(ZTOV(xzp));
 		} else if (error == ENOENT) {
 			/*
 			 * If there aren't extended attributes, it's the
 			 * same as having zero of them.
 			 */
 			error = 0;
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 
 	case _PC_SATTR_ENABLED:
 	case _PC_SATTR_EXISTS:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
 		    (vp->v_type == VREG || vp->v_type == VDIR);
 		return (0);
 
 	case _PC_ACCESS_FILTERING:
 		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
 		    vp->v_type == VDIR;
 		return (0);
 
 	case _PC_ACL_ENABLED:
 		*valp = _ACL_ACE_ENABLED;
 		return (0);
 #endif	/* illumos */
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		return (0);
 #ifdef illumos
 	case _PC_TIMESTAMP_RESOLUTION:
 		/* nanosecond timestamp resolution */
 		*valp = 1L;
 		return (0);
 #endif
 	case _PC_ACL_EXTENDED:
 		*valp = 0;
 		return (0);
 
 	case _PC_ACL_NFS4:
 		*valp = 1;
 		return (0);
 
 	case _PC_ACL_PATH_MAX:
 		*valp = ACL_MAX_ENTRIES;
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*ARGSUSED*/
 static int
 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*ARGSUSED*/
 int
 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	zilog_t	*zilog = zfsvfs->z_log;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 static int
 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	objset_t *os = zp->z_zfsvfs->z_os;
 	locked_range_t *lr;
 	vm_object_t object;
 	off_t start, end, obj_size;
 	uint_t blksz;
 	int pgsin_b, pgsin_a;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	start = IDX_TO_OFF(ma[0]->pindex);
 	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
 
 	/*
 	 * Lock a range covering all required and optional pages.
 	 * Note that we need to handle the case of the block size growing.
 	 */
 	for (;;) {
 		blksz = zp->z_blksz;
 		lr = rangelock_enter(&zp->z_rangelock, rounddown(start, blksz),
 		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
 		if (blksz == zp->z_blksz)
 			break;
 		rangelock_exit(lr);
 	}
 
 	object = ma[0]->object;
 	zfs_vmobject_wlock(object);
 	obj_size = object->un_pager.vnp.vnp_size;
 	zfs_vmobject_wunlock(object);
 	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
 		rangelock_exit(lr);
 		ZFS_EXIT(zfsvfs);
 		return (zfs_vm_pagerret_bad);
 	}
 
 	pgsin_b = 0;
 	if (rbehind != NULL) {
 		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
 		pgsin_b = MIN(*rbehind, pgsin_b);
 	}
 
 	pgsin_a = 0;
 	if (rahead != NULL) {
 		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
 		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
 			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
 		pgsin_a = MIN(*rahead, pgsin_a);
 	}
 
 	/*
 	 * NB: we need to pass the exact byte size of the data that we expect
 	 * to read after accounting for the file size.  This is required because
 	 * ZFS will panic if we request DMU to read beyond the end of the last
 	 * allocated block.
 	 */
 	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
 	    MIN(end, obj_size) - (end - PAGE_SIZE));
 
 	rangelock_exit(lr);
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
 
 	if (error != 0)
 		return (zfs_vm_pagerret_error);
 
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	return (zfs_vm_pagerret_ok);
 }
 
 static int
 zfs_freebsd_getpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int *a_rbehind;
 		int *a_rahead;
 	} */ *ap;
 {
 
 	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead));
 }
 
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	locked_range_t	*lr;
 	dmu_tx_t	*tx;
 	struct sf_buf	*sf;
 	vm_object_t	object;
 	vm_page_t	m;
 	caddr_t		va;
 	size_t		tocopy;
 	size_t		lo_len;
 	vm_ooffset_t	lo_off;
 	vm_ooffset_t	off;
 	uint_t		blksz;
 	int		ncount;
 	int		pcount;
 	int		err;
 	int		i;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	object = vp->v_object;
 	pcount = btoc(len);
 	ncount = pcount;
 
 	KASSERT(ma[0]->object == object, ("mismatching object"));
 	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
 
 	for (i = 0; i < pcount; i++)
 		rtvals[i] = zfs_vm_pagerret_error;
 
 	off = IDX_TO_OFF(ma[0]->pindex);
 	blksz = zp->z_blksz;
 	lo_off = rounddown(off, blksz);
 	lo_len = roundup(len + (off - lo_off), blksz);
 	lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
 
 	zfs_vmobject_wlock(object);
 	if (len + off > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > off) {
 			int pgoff;
 
 			len = object->un_pager.vnp.vnp_size - off;
 			ncount = btoc(len);
 			if ((pgoff = (int)len & PAGE_MASK) != 0) {
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 				    ("zfs_putpages: page %p is not read-only", m));
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			len = 0;
 			ncount = 0;
 		}
 		if (ncount < pcount) {
 			for (i = ncount; i < pcount; i++) {
 				rtvals[i] = zfs_vm_pagerret_bad;
 			}
 		}
 	}
 	zfs_vmobject_wunlock(object);
 
 	if (ncount == 0)
 		goto out;
 
 	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 		goto out;
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
 		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
 			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
 			va = zfs_map_page(ma[i], &sf);
 			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 
 		zfs_vmobject_wlock(object);
 		for (i = 0; i < ncount; i++) {
 			rtvals[i] = zfs_vm_pagerret_ok;
 			vm_page_undirty(ma[i]);
 		}
 		zfs_vmobject_wunlock(object);
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}
 	dmu_tx_commit(tx);
 
 out:
 	rangelock_exit(lr);
 	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	ZFS_EXIT(zfsvfs);
 	return (rtvals[0]);
 }
 
 int
 zfs_freebsd_putpages(ap)
 	struct vop_putpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_sync;
 		int *a_rtvals;
 	} */ *ap;
 {
 
 	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
 	    ap->a_rtvals));
 }
 
 static int
 zfs_freebsd_bmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn;
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	return (0);
 }
 
 static int
 zfs_freebsd_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	int error;
 
 	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
 	if (error == 0)
 		vnode_create_vobject(vp, zp->z_size, ap->a_td);
 	return (error);
 }
 
 static int
 zfs_freebsd_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int a_fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL, NULL));
 }
 
 static int
 ioflags(int ioflags)
 {
 	int flags = 0;
 
 	if (ioflags & IO_APPEND)
 		flags |= FAPPEND;
 	if (ioflags & IO_NDELAY)
 		flags |= FNONBLOCK;
 	if (ioflags & IO_SYNC)
 		flags |= (FSYNC | FDSYNC | FRSYNC);
 
 	return (flags);
 }
 
 static int
 zfs_freebsd_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
 	    ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
 	    ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	accmode_t accmode;
 	int error = 0;
 
 	/*
 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0)
 		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
 
 	/*
 	 * VADMIN has to be handled by vaccess().
 	 */
 	if (error == 0) {
 		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
 		if (accmode != 0) {
 			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 			    zp->z_gid, accmode, ap->a_cred, NULL);
 		}
 	}
 
 	/*
 	 * For VEXEC, ensure that at least one execute bit is set for
 	 * non-directories.
 	 */
 	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
 	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
 		error = EACCES;
 	}
 
 	return (error);
 }
 
 static int
 zfs_freebsd_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 
 	ASSERT(cnp->cn_namelen < sizeof(nm));
 	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, cnp->cn_thread, 0));
 }
 
 static int
 zfs_cache_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	zfsvfs_t *zfsvfs;
 
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 	if (zfsvfs->z_use_namecache)
 		return (vfs_cache_lookup(ap));
 	else
 		return (zfs_freebsd_lookup(ap));
 }
 
 static int
 zfs_freebsd_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	zfsvfs_t *zfsvfs;
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	int error, mode;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 
 	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
 	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
 	if (zfsvfs->z_use_namecache &&
 	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 	return (error);
 }
 
 static int
 zfs_freebsd_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 
 	vattr_init_mask(vap);
 
 	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
 	    ap->a_cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 static int
 zfs_freebsd_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 
 	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 static int
 zfs_freebsd_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	vop_stdfsync(ap);
 	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
 }
 
 static int
 zfs_freebsd_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vattr_t *vap = ap->a_vap;
 	xvattr_t xvap;
 	u_long fflags = 0;
 	int error;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 	xvap.xva_vattr.va_mask |= AT_XVATTR;
 
 	/* Convert chflags into ZFS-type flags. */
 	/* XXX: what about SF_SETTABLE?. */
 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
 	XVA_SET_REQ(&xvap, XAT_READONLY);
 	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
 	XVA_SET_REQ(&xvap, XAT_SYSTEM);
 	XVA_SET_REQ(&xvap, XAT_HIDDEN);
 	XVA_SET_REQ(&xvap, XAT_REPARSE);
 	XVA_SET_REQ(&xvap, XAT_OFFLINE);
 	XVA_SET_REQ(&xvap, XAT_SPARSE);
 
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
 	if (error != 0)
 		return (error);
 
 	/* Convert ZFS xattr into chflags. */
 #define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
 		fflags |= (fflag);					\
 } while (0)
 	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
 	    xvap.xva_xoptattrs.xoa_immutable);
 	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
 	    xvap.xva_xoptattrs.xoa_appendonly);
 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
 	    xvap.xva_xoptattrs.xoa_nounlink);
 	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
 	    xvap.xva_xoptattrs.xoa_archive);
 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
 	    xvap.xva_xoptattrs.xoa_nodump);
 	FLAG_CHECK(UF_READONLY, XAT_READONLY,
 	    xvap.xva_xoptattrs.xoa_readonly);
 	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
 	    xvap.xva_xoptattrs.xoa_system);
 	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
 	    xvap.xva_xoptattrs.xoa_hidden);
 	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
 	    xvap.xva_xoptattrs.xoa_reparse);
 	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
 	    xvap.xva_xoptattrs.xoa_offline);
 	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
 	    xvap.xva_xoptattrs.xoa_sparse);
 
 #undef	FLAG_CHECK
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
 	return (0);
 }
 
 static int
 zfs_freebsd_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	cred_t *cred = ap->a_cred;
 	xvattr_t xvap;
 	u_long fflags;
 	uint64_t zflags;
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
 	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
 		int error;
 
 		if (zfsvfs->z_use_fuids == B_FALSE)
 			return (EOPNOTSUPP);
 
 		fflags = vap->va_flags;
 		/*
 		 * XXX KDM 
 		 * We need to figure out whether it makes sense to allow
 		 * UF_REPARSE through, since we don't really have other
 		 * facilities to handle reparse points and zfs_setattr()
 		 * doesn't currently allow setting that attribute anyway.
 		 */
 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
 		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
 		     UF_OFFLINE|UF_SPARSE)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
 		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				error = securelevel_gt(cred, 0);
 				if (error != 0)
 					return (error);
 			}
 		} else {
 			/*
 			 * Callers may only modify the file flags on objects they
 			 * have VADMIN rights for.
 			 */
 			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
 				return (error);
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				return (EPERM);
 			}
 			if (fflags &
 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
 				return (EPERM);
 			}
 		}
 
 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
 		XVA_SET_REQ(&xvap, (xflag));				\
 		(xfield) = ((fflags & (fflag)) != 0);			\
 	}								\
 } while (0)
 		/* Convert chflags into ZFS-type flags. */
 		/* XXX: what about SF_SETTABLE?. */
 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 		    xvap.xva_xoptattrs.xoa_immutable);
 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
 		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
 		    xvap.xva_xoptattrs.xoa_archive);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
 		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
 		    xvap.xva_xoptattrs.xoa_readonly);
 		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
 		    xvap.xva_xoptattrs.xoa_system);
 		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
 		    xvap.xva_xoptattrs.xoa_hidden);
 		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
 		    xvap.xva_xoptattrs.xoa_reparse);
 		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
 		    xvap.xva_xoptattrs.xoa_offline);
 		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
 		    xvap.xva_xoptattrs.xoa_sparse);
 #undef	FLAG_CHANGE
 	}
 	if (vap->va_birthtime.tv_sec != VNOVAL) {
 		xvap.xva_vattr.va_mask |= AT_XVATTR;
 		XVA_SET_REQ(&xvap, XAT_CREATETIME);
 	}
 	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
 }
 
 static int
 zfs_freebsd_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 
 	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
 	    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp != NULL)
 		vrele(tvp);
 
 	return (error);
 }
 
 static int
 zfs_freebsd_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 
 	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
 	    __DECONST(char *, ap->a_target), cnp->cn_cred, cnp->cn_thread));
 }
 
 static int
 zfs_freebsd_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 
 	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
 }
 
 static int
 zfs_freebsd_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *vp = ap->a_vp;
 	vnode_t *tdvp = ap->a_tdvp;
 
 	if (tdvp->v_mount != vp->v_mount)
 		return (EXDEV);
 
 	ASSERT(cnp->cn_flags & SAVENAME);
 
 	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
 }
 
 static int
 zfs_freebsd_inactive(ap)
 	struct vop_inactive_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t *vp = ap->a_vp;
 
 	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
 	return (0);
 }
 
 static int
 zfs_freebsd_reclaim(ap)
 	struct vop_reclaim_args /* {
 		struct vnode *a_vp;
 		struct thread *a_td;
 	} */ *ap;
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 
 	ASSERT(zp != NULL);
 
 	/*
 	 * z_teardown_inactive_lock protects from a race with
 	 * zfs_znode_dmu_fini in zfsvfs_teardown during
 	 * force unmount.
 	 */
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 	if (zp->z_sa_hdl == NULL)
 		zfs_znode_free(zp);
 	else
 		zfs_zinactive(zp);
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 
 	vp->v_data = NULL;
 	return (0);
 }
 
 static int
 zfs_freebsd_fid(ap)
 	struct vop_fid_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fid;
 	} */ *ap;
 {
 
 	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
 }
 
 static int
 zfs_freebsd_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
 	if (error == 0) {
 		*ap->a_retval = val;
 		return (error);
 	}
 	if (error != EOPNOTSUPP)
 		return (error);
 
 	switch (ap->a_name) {
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
 			*ap->a_retval = PIPE_BUF;
 			return (0);
 		}
 		return (EINVAL);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
 /*
  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
  * extended attribute name:
  *
  *	NAMESPACE	PREFIX	
  *	system		freebsd:system:
  *	user		(none, can be used to access ZFS fsattr(5) attributes
  *			created on Solaris)
  */
 static int
 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
     size_t size)
 {
 	const char *namespace, *prefix, *suffix;
 
 	/* We don't allow '/' character in attribute name. */
 	if (strchr(name, '/') != NULL)
 		return (EINVAL);
 	/* We don't allow attribute names that start with "freebsd:" string. */
 	if (strncmp(name, "freebsd:", 8) == 0)
 		return (EINVAL);
 
 	bzero(attrname, size);
 
 	switch (attrnamespace) {
 	case EXTATTR_NAMESPACE_USER:
 #if 0
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_USER_STRING;
 		suffix = ":";
 #else
 		/*
 		 * This is the default namespace by which we can access all
 		 * attributes created on Solaris.
 		 */
 		prefix = namespace = suffix = "";
 #endif
 		break;
 	case EXTATTR_NAMESPACE_SYSTEM:
 		prefix = "freebsd:";
 		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
 		suffix = ":";
 		break;
 	case EXTATTR_NAMESPACE_EMPTY:
 	default:
 		return (EINVAL);
 	}
 	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
 	    name) >= size) {
 		return (ENAMETOOLONG);
 	}
 	return (0);
 }
 
 /*
  * Vnode operating to retrieve a named extended attribute.
  */
 static int
 zfs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	flags = FREAD;
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
 		if (error == 0)
 			*ap->a_size = (size_t)va.va_size;
 	} else if (ap->a_uio != NULL)
 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  */
 int
 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (error == ENOENT)
 			error = ENOATTR;
 		return (error);
 	}
 
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 zfs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrname[255];
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof(attrname));
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR | CREATE_XATTR_DIR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
 	    NULL);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
 		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, flags, ap->a_cred, td);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  */
 static int
 zfs_listextattr(struct vop_listextattr_args *ap)
 /*
 vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	char attrprefix[16];
 	u_char dirbuf[sizeof(struct dirent)];
 	struct dirent *dp;
 	struct iovec aiov;
 	struct uio auio, *uio = ap->a_uio;
 	size_t *sizep = ap->a_size;
 	size_t plen;
 	vnode_t *xvp = NULL, *vp;
 	int done, error, eof, pos;
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (error);
 
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof(attrprefix));
 	if (error != 0)
 		return (error);
 	plen = strlen(attrprefix);
 
 	ZFS_ENTER(zfsvfs);
 
 	if (sizep != NULL)
 		*sizep = 0;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
 	    LOOKUP_XATTR);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		/*
 		 * ENOATTR means that the EA directory does not yet exist,
 		 * i.e. there are no extended attributes there.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 		return (error);
 	}
 
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 	error = namei(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = 0;
 
 	do {
 		u_char nlen;
 
 		aiov.iov_base = (void *)dirbuf;
 		aiov.iov_len = sizeof(dirbuf);
 		auio.uio_resid = sizeof(dirbuf);
 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
 		done = sizeof(dirbuf) - auio.uio_resid;
 		if (error != 0)
 			break;
 		for (pos = 0; pos < done;) {
 			dp = (struct dirent *)(dirbuf + pos);
 			pos += dp->d_reclen;
 			/*
 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
 			 * is what we get when attribute was created on Solaris.
 			 */
 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
 				continue;
 			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
 				continue;
 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
 				continue;
 			nlen = dp->d_namlen - plen;
 			if (sizep != NULL)
 				*sizep += 1 + nlen;
 			else if (uio != NULL) {
 				/*
 				 * Format of extattr name entry is one byte for
 				 * length and the rest for name.
 				 */
 				error = uiomove(&nlen, 1, uio->uio_rw, uio);
 				if (error == 0) {
 					error = uiomove(dp->d_name + plen, nlen,
 					    uio->uio_rw, uio);
 				}
 				if (error != 0)
 					break;
 			}
 		}
 	} while (!eof && error == 0);
 
 	vput(vp);
 	ZFS_EXIT(zfsvfs);
 
 	return (error);
 }
 
 int
 zfs_freebsd_getacl(ap)
 	struct vop_getacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int		error;
 	vsecattr_t      vsecattr;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
 	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
 		return (error);
 
 	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
 	if (vsecattr.vsa_aclentp != NULL)
 		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
 
 	return (error);
 }
 
 int
 zfs_freebsd_setacl(ap)
 	struct vop_setacl_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	int		error;
 	vsecattr_t      vsecattr;
 	int		aclbsize;	/* size of acl list in bytes */
 	aclent_t	*aaclp;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	if (ap->a_aclp == NULL)
 		return (EINVAL);
 
 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
 	/*
 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
 	 * splitting every entry into two and appending "canonical six"
 	 * entries at the end.  Don't allow for setting an ACL that would
 	 * cause chmod(2) to run out of ACL entries.
 	 */
 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
 		return (ENOSPC);
 
 	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
 	if (error != 0)
 		return (error);
 
 	vsecattr.vsa_mask = VSA_ACE;
 	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
 	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
 	aaclp = vsecattr.vsa_aclentp;
 	vsecattr.vsa_aclentsz = aclbsize;
 
 	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
 	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
 	kmem_free(aaclp, aclbsize);
 
 	return (error);
 }
 
 int
 zfs_freebsd_aclcheck(ap)
 	struct vop_aclcheck_args /* {
 		struct vnode *vp;
 		acl_type_t type;
 		struct acl *aclp;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 
 	return (EOPNOTSUPP);
 }
 
 static int
 zfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	vnode_t *covered_vp;
 	vnode_t *vp = ap->a_vp;;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	znode_t *zp = VTOZ(vp);
 	enum vgetstate vs;
 	int ltype;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
 	/*
 	 * If we are a snapshot mounted under .zfs, run the operation
 	 * on the covered vnode.
 	 */
 	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
 		char name[MAXNAMLEN + 1];
 		znode_t *dzp;
 		size_t len;
 
 		error = zfs_znode_parent_and_name(zp, &dzp, name);
 		if (error == 0) {
 			len = strlen(name);
 			if (*ap->a_buflen < len)
 				error = SET_ERROR(ENOMEM);
 		}
 		if (error == 0) {
 			*ap->a_buflen -= len;
 			bcopy(name, ap->a_buf + *ap->a_buflen, len);
 			*ap->a_vpp = ZTOV(dzp);
 		}
 		ZFS_EXIT(zfsvfs);
 		return (error);
 	}
 	ZFS_EXIT(zfsvfs);
 
 	covered_vp = vp->v_mount->mnt_vnodecovered;
 	vs = vget_prep(covered_vp);
 	ltype = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	error = vget_finish(covered_vp, LK_SHARED, vs);
 	if (error == 0) {
 		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
 		    ap->a_buf, ap->a_buflen);
 		vput(covered_vp);
 	}
 	vn_lock(vp, ltype | LK_RETRY);
 	if (VN_IS_DOOMED(vp))
 		error = SET_ERROR(ENOENT);
 	return (error);
 }
 
 #ifdef DIAGNOSTIC
 static int
 zfs_lock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		char *file;
 		int line;
 	} */ *ap;
 {
 	vnode_t *vp;
 	znode_t *zp;
 	int err;
 
 	err = vop_lock(ap);
 	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
 		vp = ap->a_vp;
 		zp = vp->v_data;
 		if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) &&
 		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
 			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
 	}
 	return (err);
 }
 #endif
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
 
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_access =		zfs_freebsd_access,
 	.vop_allocate =		VOP_EINVAL,
 	.vop_lookup =		zfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_lookup,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
 	.vop_mknod =		zfs_freebsd_create,
 	.vop_mkdir =		zfs_freebsd_mkdir,
 	.vop_readdir =		zfs_freebsd_readdir,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_open =		zfs_freebsd_open,
 	.vop_close =		zfs_freebsd_close,
 	.vop_rmdir =		zfs_freebsd_rmdir,
 	.vop_ioctl =		zfs_freebsd_ioctl,
 	.vop_link =		zfs_freebsd_link,
 	.vop_symlink =		zfs_freebsd_symlink,
 	.vop_readlink =		zfs_freebsd_readlink,
 	.vop_read =		zfs_freebsd_read,
 	.vop_write =		zfs_freebsd_write,
 	.vop_remove =		zfs_freebsd_remove,
 	.vop_rename =		zfs_freebsd_rename,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 	.vop_bmap =		zfs_freebsd_bmap,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getextattr =	zfs_getextattr,
 	.vop_deleteextattr =	zfs_deleteextattr,
 	.vop_setextattr =	zfs_setextattr,
 	.vop_listextattr =	zfs_listextattr,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
 	.vop_putpages =		zfs_freebsd_putpages,
 	.vop_vptocnp =		zfs_vptocnp,
 #ifdef DIAGNOSTIC
 	.vop_lock1 =		zfs_lock,
 #else
 	.vop_lock1 =		vop_lock,
 #endif
 	.vop_unlock =		vop_unlock,
 	.vop_islocked =		vop_islocked,
 };
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
 
 struct vop_vector zfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_access =		zfs_freebsd_access,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_pathconf = 	zfs_freebsd_pathconf,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 };
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
 
 /*
  * special share hidden files vnode operations template
  */
 struct vop_vector zfs_shareops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		zfs_freebsd_access,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 };
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
Index: head/sys/fs/autofs/autofs_vnops.c
===================================================================
--- head/sys/fs/autofs/autofs_vnops.c	(revision 355789)
+++ head/sys/fs/autofs/autofs_vnops.c	(revision 355790)
@@ -1,714 +1,715 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Edward Tomasz Napierala under sponsorship
  * from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/condvar.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 #include <vm/uma.h>
 
 #include <fs/autofs/autofs.h>
 
 static int	autofs_trigger_vn(struct vnode *vp, const char *path,
 		    int pathlen, struct vnode **newvp);
 
 extern struct autofs_softc	*autofs_softc;
 
 static int
 autofs_access(struct vop_access_args *ap)
 {
 
 	/*
 	 * Nothing to do here; the only kind of access control
 	 * needed is in autofs_mkdir().
 	 */
 
 	return (0);
 }
 
 static int
 autofs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp, *newvp;
 	struct autofs_node *anp;
 	struct mount *mp;
 	struct vattr *vap;
 	int error;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 	mp = vp->v_mount;
 	vap = ap->a_vap;
 
 	KASSERT(ap->a_vp->v_type == VDIR, ("!VDIR"));
 
 	/*
 	 * The reason we must do this is that some tree-walking software,
 	 * namely fts(3), assumes that stat(".") results will not change
 	 * between chdir("subdir") and chdir(".."), and fails with ENOENT
 	 * otherwise.
 	 */
 	if (autofs_mount_on_stat && autofs_cached(anp, NULL, 0) == false &&
 	    autofs_ignore_thread(curthread) == false) {
 		error = autofs_trigger_vn(vp, "", 0, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			error = VOP_GETATTR(newvp, ap->a_vap,
 			    ap->a_cred);
 			vput(newvp);
 			return (error);
 		}
 	}
 
 	vap->va_type = VDIR;
 	vap->va_mode = 0755;
 	vap->va_nlink = 3; /* XXX */
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_rdev = NODEV;
 	vap->va_fsid = mp->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = anp->an_fileno;
 	vap->va_size = S_BLKSIZE;
 	vap->va_blocksize = S_BLKSIZE;
 	vap->va_mtime = anp->an_ctime;
 	vap->va_atime = anp->an_ctime;
 	vap->va_ctime = anp->an_ctime;
 	vap->va_birthtime = anp->an_ctime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = 0;
 	vap->va_bytes = S_BLKSIZE;
 	vap->va_filerev = 0;
 	vap->va_spare = 0;
 
 	return (0);
 }
 
 /*
  * Unlock the vnode, request automountd(8) action, and then lock it back.
  * If anything got mounted on top of the vnode, return the new filesystem's
  * root vnode in 'newvp', locked.
  */
 static int
 autofs_trigger_vn(struct vnode *vp, const char *path, int pathlen,
     struct vnode **newvp)
 {
 	struct autofs_node *anp;
 	int error, lock_flags;
 
 	anp = vp->v_data;
 
 	/*
 	 * Release the vnode lock, so that other operations, in partcular
 	 * mounting a filesystem on top of it, can proceed.  Increase use
 	 * count, to prevent the vnode from being deallocated and to prevent
 	 * filesystem from being unmounted.
 	 */
 	lock_flags = VOP_ISLOCKED(vp);
 	vref(vp);
 	VOP_UNLOCK(vp, 0);
 
 	sx_xlock(&autofs_softc->sc_lock);
 
 	/*
 	 * XXX: Workaround for mounting the same thing multiple times; revisit.
 	 */
 	if (vp->v_mountedhere != NULL) {
 		error = 0;
 		goto mounted;
 	}
 
 	error = autofs_trigger(anp, path, pathlen);
 mounted:
 	sx_xunlock(&autofs_softc->sc_lock);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	vunref(vp);
 	if (VN_IS_DOOMED(vp)) {
 		AUTOFS_DEBUG("VIRF_DOOMED");
 		return (ENOENT);
 	}
 
 	if (error != 0)
 		return (error);
 
 	if (vp->v_mountedhere == NULL) {
 		*newvp = NULL;
 		return (0);
 	} else {
 		/*
 		 * If the operation that succeeded was mount, then mark
 		 * the node as non-cached.  Otherwise, if someone unmounts
 		 * the filesystem before the cache times out, we will fail
 		 * to trigger.
 		 */
 		anp->an_cached = false;
 	}
 
 	error = VFS_ROOT(vp->v_mountedhere, lock_flags, newvp);
 	if (error != 0) {
 		AUTOFS_WARN("VFS_ROOT() failed with error %d", error);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 autofs_vget_callback(struct mount *mp, void *arg, int flags,
     struct vnode **vpp)
 {
 
 
 	return (autofs_node_vn(arg, mp, flags, vpp));
 }
 
 static int
 autofs_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp, *newvp, **vpp;
 	struct mount *mp;
 	struct autofs_mount *amp;
 	struct autofs_node *anp, *child;
 	struct componentname *cnp;
 	int error;
 
 	dvp = ap->a_dvp;
 	vpp = ap->a_vpp;
 	mp = dvp->v_mount;
 	amp = VFSTOAUTOFS(mp);
 	anp = dvp->v_data;
 	cnp = ap->a_cnp;
 
 	if (cnp->cn_flags & ISDOTDOT) {
 		KASSERT(anp->an_parent != NULL, ("NULL parent"));
 		/*
 		 * Note that in this case, dvp is the child vnode, and we
 		 * are looking up the parent vnode - exactly reverse from
 		 * normal operation.  Unlocking dvp requires some rather
 		 * tricky unlock/relock dance to prevent mp from being freed;
 		 * use vn_vget_ino_gen() which takes care of all that.
 		 */
 		error = vn_vget_ino_gen(dvp, autofs_vget_callback,
 		    anp->an_parent, cnp->cn_lkflags, vpp);
 		if (error != 0) {
 			AUTOFS_WARN("vn_vget_ino_gen() failed with error %d",
 			    error);
 			return (error);
 		}
 		return (error);
 	}
 
 	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		vref(dvp);
 		*vpp = dvp;
 
 		return (0);
 	}
 
 	if (autofs_cached(anp, cnp->cn_nameptr, cnp->cn_namelen) == false &&
 	    autofs_ignore_thread(cnp->cn_thread) == false) {
 		error = autofs_trigger_vn(dvp,
 		    cnp->cn_nameptr, cnp->cn_namelen, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			/*
 			 * The target filesystem got automounted.
 			 * Let the lookup(9) go around with the same
 			 * path component.
 			 */
 			vput(newvp);
 			return (ERELOOKUP);
 		}
 	}
 
 	AUTOFS_SLOCK(amp);
 	error = autofs_node_find(anp, cnp->cn_nameptr, cnp->cn_namelen, &child);
 	if (error != 0) {
 		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
 			AUTOFS_SUNLOCK(amp);
 			return (EJUSTRETURN);
 		}
 
 		AUTOFS_SUNLOCK(amp);
 		return (ENOENT);
 	}
 
 	/*
 	 * XXX: Dropping the node here is ok, because we never remove nodes.
 	 */
 	AUTOFS_SUNLOCK(amp);
 
 	error = autofs_node_vn(child, mp, cnp->cn_lkflags, vpp);
 	if (error != 0) {
 		if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE)
 			return (EJUSTRETURN);
 
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 autofs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 	struct autofs_mount *amp;
 	struct autofs_node *child;
 	int error;
 
 	vp = ap->a_dvp;
 	anp = vp->v_data;
 	amp = VFSTOAUTOFS(vp->v_mount);
 
 	/*
 	 * Do not allow mkdir() if the calling thread is not
 	 * automountd(8) descendant.
 	 */
 	if (autofs_ignore_thread(curthread) == false)
 		return (EPERM);
 
 	AUTOFS_XLOCK(amp);
 	error = autofs_node_new(anp, amp, ap->a_cnp->cn_nameptr,
 	    ap->a_cnp->cn_namelen, &child);
 	if (error != 0) {
 		AUTOFS_XUNLOCK(amp);
 		return (error);
 	}
 	AUTOFS_XUNLOCK(amp);
 
 	error = autofs_node_vn(child, vp->v_mount, LK_EXCLUSIVE, ap->a_vpp);
 
 	return (error);
 }
 
 static int
 autofs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 
 	printf("    name \"%s\", fileno %d, cached %d, wildcards %d\n",
 	    anp->an_name, anp->an_fileno, anp->an_cached, anp->an_wildcards);
 
 	return (0);
 }
 
 /*
  * Write out a single 'struct dirent', based on 'name' and 'fileno' arguments.
  */
 static int
 autofs_readdir_one(struct uio *uio, const char *name, int fileno,
     size_t *reclenp)
 {
 	struct dirent dirent;
 	size_t namlen, reclen;
 	int error;
 
 	namlen = strlen(name);
 	reclen = _GENERIC_DIRLEN(namlen);
 	if (reclenp != NULL)
 		*reclenp = reclen;
 
 	if (uio == NULL)
 		return (0);
 
 	if (uio->uio_resid < reclen)
 		return (EINVAL);
 
 	dirent.d_fileno = fileno;
 	dirent.d_reclen = reclen;
 	dirent.d_type = DT_DIR;
 	dirent.d_namlen = namlen;
 	memcpy(dirent.d_name, name, namlen);
 	dirent_terminate(&dirent);
 	error = uiomove(&dirent, reclen, uio);
 
 	return (error);
 }
 
 static size_t
 autofs_dirent_reclen(const char *name)
 {
 	size_t reclen;
 
 	(void)autofs_readdir_one(NULL, name, -1, &reclen);
 
 	return (reclen);
 }
 
 static int
 autofs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp, *newvp;
 	struct autofs_mount *amp;
 	struct autofs_node *anp, *child;
 	struct uio *uio;
 	size_t reclen, reclens;
 	ssize_t initial_resid;
 	int error;
 
 	vp = ap->a_vp;
 	amp = VFSTOAUTOFS(vp->v_mount);
 	anp = vp->v_data;
 	uio = ap->a_uio;
 	initial_resid = ap->a_uio->uio_resid;
 
 	KASSERT(vp->v_type == VDIR, ("!VDIR"));
 
 	if (autofs_cached(anp, NULL, 0) == false &&
 	    autofs_ignore_thread(curthread) == false) {
 		error = autofs_trigger_vn(vp, "", 0, &newvp);
 		if (error != 0)
 			return (error);
 
 		if (newvp != NULL) {
 			error = VOP_READDIR(newvp, ap->a_uio, ap->a_cred,
 			    ap->a_eofflag, ap->a_ncookies, ap->a_cookies);
 			vput(newvp);
 			return (error);
 		}
 	}
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = FALSE;
 
 	/*
 	 * Write out the directory entry for ".".  This is conditional
 	 * on the current offset into the directory; same applies to the
 	 * other two cases below.
 	 */
 	if (uio->uio_offset == 0) {
 		error = autofs_readdir_one(uio, ".", anp->an_fileno, &reclen);
 		if (error != 0)
 			goto out;
 	}
 	reclens = autofs_dirent_reclen(".");
 
 	/*
 	 * Write out the directory entry for "..".
 	 */
 	if (uio->uio_offset <= reclens) {
 		if (uio->uio_offset != reclens)
 			return (EINVAL);
 		if (anp->an_parent == NULL) {
 			error = autofs_readdir_one(uio, "..",
 			    anp->an_fileno, &reclen);
 		} else {
 			error = autofs_readdir_one(uio, "..",
 			    anp->an_parent->an_fileno, &reclen);
 		}
 		if (error != 0)
 			goto out;
 	}
 
 	reclens += autofs_dirent_reclen("..");
 
 	/*
 	 * Write out the directory entries for subdirectories.
 	 */
 	AUTOFS_SLOCK(amp);
 	RB_FOREACH(child, autofs_node_tree, &anp->an_children) {
 		/*
 		 * Check the offset to skip entries returned by previous
 		 * calls to getdents().
 		 */
 		if (uio->uio_offset > reclens) {
 			reclens += autofs_dirent_reclen(child->an_name);
 			continue;
 		}
 
 		/*
 		 * Prevent seeking into the middle of dirent.
 		 */
 		if (uio->uio_offset != reclens) {
 			AUTOFS_SUNLOCK(amp);
 			return (EINVAL);
 		}
 
 		error = autofs_readdir_one(uio, child->an_name,
 		    child->an_fileno, &reclen);
 		reclens += reclen;
 		if (error != 0) {
 			AUTOFS_SUNLOCK(amp);
 			goto out;
 		}
 	}
 	AUTOFS_SUNLOCK(amp);
 
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = TRUE;
 
 	return (0);
 
 out:
 	/*
 	 * Return error if the initial buffer was too small to do anything.
 	 */
 	if (uio->uio_resid == initial_resid)
 		return (error);
 
 	/*
 	 * Don't return an error if we managed to copy out some entries.
 	 */
 	if (uio->uio_resid < reclen)
 		return (0);
 
 	return (error);
 }
 
 static int
 autofs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct autofs_node *anp;
 
 	vp = ap->a_vp;
 	anp = vp->v_data;
 
 	/*
 	 * We do not free autofs_node here; instead we are
 	 * destroying them in autofs_node_delete().
 	 */
 	sx_xlock(&anp->an_vnode_lock);
 	anp->an_vnode = NULL;
 	vp->v_data = NULL;
 	sx_xunlock(&anp->an_vnode_lock);
 
 	return (0);
 }
 
 struct vop_vector autofs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		autofs_access,
 	.vop_lookup =		autofs_lookup,
 	.vop_create =		VOP_EOPNOTSUPP,
 	.vop_getattr =		autofs_getattr,
 	.vop_link =		VOP_EOPNOTSUPP,
 	.vop_mkdir =		autofs_mkdir,
 	.vop_mknod =		VOP_EOPNOTSUPP,
 	.vop_print =		autofs_print,
 	.vop_read =		VOP_EOPNOTSUPP,
 	.vop_readdir =		autofs_readdir,
 	.vop_remove =		VOP_EOPNOTSUPP,
 	.vop_rename =		VOP_EOPNOTSUPP,
 	.vop_rmdir =		VOP_EOPNOTSUPP,
 	.vop_setattr =		VOP_EOPNOTSUPP,
 	.vop_symlink =		VOP_EOPNOTSUPP,
 	.vop_write =		VOP_EOPNOTSUPP,
 	.vop_reclaim =		autofs_reclaim,
 };
+VFS_VOP_VECTOR_REGISTER(autofs_vnodeops);
 
 int
 autofs_node_new(struct autofs_node *parent, struct autofs_mount *amp,
     const char *name, int namelen, struct autofs_node **anpp)
 {
 	struct autofs_node *anp;
 
 	if (parent != NULL) {
 		AUTOFS_ASSERT_XLOCKED(parent->an_mount);
 
 		KASSERT(autofs_node_find(parent, name, namelen, NULL) == ENOENT,
 		    ("node \"%s\" already exists", name));
 	}
 
 	anp = uma_zalloc(autofs_node_zone, M_WAITOK | M_ZERO);
 	if (namelen >= 0)
 		anp->an_name = strndup(name, namelen, M_AUTOFS);
 	else
 		anp->an_name = strdup(name, M_AUTOFS);
 	anp->an_fileno = atomic_fetchadd_int(&amp->am_last_fileno, 1);
 	callout_init(&anp->an_callout, 1);
 	/*
 	 * The reason for SX_NOWITNESS here is that witness(4)
 	 * cannot tell vnodes apart, so the following perfectly
 	 * valid lock order...
 	 *
 	 * vnode lock A -> autofsvlk B -> vnode lock B
 	 *
 	 * ... gets reported as a LOR.
 	 */
 	sx_init_flags(&anp->an_vnode_lock, "autofsvlk", SX_NOWITNESS);
 	getnanotime(&anp->an_ctime);
 	anp->an_parent = parent;
 	anp->an_mount = amp;
 	if (parent != NULL)
 		RB_INSERT(autofs_node_tree, &parent->an_children, anp);
 	RB_INIT(&anp->an_children);
 
 	*anpp = anp;
 	return (0);
 }
 
 int
 autofs_node_find(struct autofs_node *parent, const char *name,
     int namelen, struct autofs_node **anpp)
 {
 	struct autofs_node *anp, find;
 	int error;
 
 	AUTOFS_ASSERT_LOCKED(parent->an_mount);
 
 	if (namelen >= 0)
 		find.an_name = strndup(name, namelen, M_AUTOFS);
 	else
 		find.an_name = strdup(name, M_AUTOFS);
 
 	anp = RB_FIND(autofs_node_tree, &parent->an_children, &find);
 	if (anp != NULL) {
 		error = 0;
 		if (anpp != NULL)
 			*anpp = anp;
 	} else {
 		error = ENOENT;
 	}
 
 	free(find.an_name, M_AUTOFS);
 
 	return (error);
 }
 
 void
 autofs_node_delete(struct autofs_node *anp)
 {
 	struct autofs_node *parent;
 
 	AUTOFS_ASSERT_XLOCKED(anp->an_mount);
 	KASSERT(RB_EMPTY(&anp->an_children), ("have children"));
 
 	callout_drain(&anp->an_callout);
 
 	parent = anp->an_parent;
 	if (parent != NULL)
 		RB_REMOVE(autofs_node_tree, &parent->an_children, anp);
 	sx_destroy(&anp->an_vnode_lock);
 	free(anp->an_name, M_AUTOFS);
 	uma_zfree(autofs_node_zone, anp);
 }
 
 int
 autofs_node_vn(struct autofs_node *anp, struct mount *mp, int flags,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	AUTOFS_ASSERT_UNLOCKED(anp->an_mount);
 
 	sx_xlock(&anp->an_vnode_lock);
 
 	vp = anp->an_vnode;
 	if (vp != NULL) {
 		error = vget(vp, flags | LK_RETRY, curthread);
 		if (error != 0) {
 			AUTOFS_WARN("vget failed with error %d", error);
 			sx_xunlock(&anp->an_vnode_lock);
 			return (error);
 		}
 		if (VN_IS_DOOMED(vp)) {
 			/*
 			 * We got forcibly unmounted.
 			 */
 			AUTOFS_DEBUG("doomed vnode");
 			sx_xunlock(&anp->an_vnode_lock);
 			vput(vp);
 
 			return (ENOENT);
 		}
 
 		*vpp = vp;
 		sx_xunlock(&anp->an_vnode_lock);
 		return (0);
 	}
 
 	error = getnewvnode("autofs", mp, &autofs_vnodeops, &vp);
 	if (error != 0) {
 		sx_xunlock(&anp->an_vnode_lock);
 		return (error);
 	}
 
 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error != 0) {
 		sx_xunlock(&anp->an_vnode_lock);
 		vdrop(vp);
 		return (error);
 	}
 
 	vp->v_type = VDIR;
 	if (anp->an_parent == NULL)
 		vp->v_vflag |= VV_ROOT;
 	vp->v_data = anp;
 
 	VN_LOCK_ASHARE(vp);
 
 	error = insmntque(vp, mp);
 	if (error != 0) {
 		AUTOFS_DEBUG("insmntque() failed with error %d", error);
 		sx_xunlock(&anp->an_vnode_lock);
 		return (error);
 	}
 
 	KASSERT(anp->an_vnode == NULL, ("lost race"));
 	anp->an_vnode = vp;
 
 	sx_xunlock(&anp->an_vnode_lock);
 
 	*vpp = vp;
 	return (0);
 }
Index: head/sys/fs/cd9660/cd9660_vnops.c
===================================================================
--- head/sys/fs/cd9660/cd9660_vnops.c	(revision 355789)
+++ head/sys/fs/cd9660/cd9660_vnops.c	(revision 355790)
@@ -1,922 +1,924 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley
  * by Pace Willisson (pace@blitz.com).  The Rock Ridge Extension
  * Support code is derived from software contributed to Berkeley
  * by Atsushi Murai (amurai@spec.co.jp).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)cd9660_vnops.c	8.19 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
 #include <sys/dirent.h>
 #include <sys/unistd.h>
 #include <sys/filio.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vnode_pager.h>
 #include <vm/uma.h>
 
 #include <fs/cd9660/iso.h>
 #include <fs/cd9660/cd9660_node.h>
 #include <fs/cd9660/iso_rrip.h>
 
 static vop_setattr_t	cd9660_setattr;
 static vop_open_t	cd9660_open;
 static vop_access_t	cd9660_access;
 static vop_getattr_t	cd9660_getattr;
 static vop_ioctl_t	cd9660_ioctl;
 static vop_pathconf_t	cd9660_pathconf;
 static vop_read_t	cd9660_read;
 struct isoreaddir;
 static int iso_uiodir(struct isoreaddir *idp, struct dirent *dp, off_t off);
 static int iso_shipdir(struct isoreaddir *idp);
 static vop_readdir_t	cd9660_readdir;
 static vop_readlink_t	cd9660_readlink;
 static vop_strategy_t	cd9660_strategy;
 static vop_vptofh_t	cd9660_vptofh;
 static vop_getpages_t	cd9660_getpages;
 
 /*
  * Setattr call. Only allowed for block and character special devices.
  */
 static int
 cd9660_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 
 	if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)
 		return (EROFS);
 	if (vap->va_size != (u_quad_t)VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			return (EROFS);
 		case VCHR:
 		case VBLK:
 		case VSOCK:
 		case VFIFO:
 		case VNON:
 		case VBAD:
 		case VMARKER:
 			return (0);
 		}
 	}
 	return (0);
 }
 
 /*
  * Check mode permission on inode pointer. Mode is READ, WRITE or EXEC.
  * The mode is shifted to select the owner/group/other fields. The
  * super user is granted all permissions.
  */
 /* ARGSUSED */
 static int
 cd9660_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts unless the file is a socket,
 	 * fifo, or a block or character device resident on the
 	 * filesystem.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			return (EROFS);
 			/* NOT REACHED */
 		default:
 			break;
 		}
 	}
 
 	return (vaccess(vp->v_type, ip->inode.iso_mode, ip->inode.iso_uid,
 	    ip->inode.iso_gid, ap->a_accmode, ap->a_cred, NULL));
 }
 
 static int
 cd9660_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		struct file *a_fp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip = VTOI(vp);
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	vnode_create_vobject(vp, ip->i_size, ap->a_td);
 	return (0);
 }
 
 
 static int
 cd9660_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct iso_node *ip = VTOI(vp);
 
 	vap->va_fsid    = dev2udev(ip->i_mnt->im_dev);
 	vap->va_fileid	= ip->i_number;
 
 	vap->va_mode	= ip->inode.iso_mode;
 	vap->va_nlink	= ip->inode.iso_links;
 	vap->va_uid	= ip->inode.iso_uid;
 	vap->va_gid	= ip->inode.iso_gid;
 	vap->va_atime	= ip->inode.iso_atime;
 	vap->va_mtime	= ip->inode.iso_mtime;
 	vap->va_ctime	= ip->inode.iso_ctime;
 	vap->va_rdev	= ip->inode.iso_rdev;
 
 	vap->va_size	= (u_quad_t) ip->i_size;
 	if (ip->i_size == 0 && (vap->va_mode & S_IFMT) == S_IFLNK) {
 		struct vop_readlink_args rdlnk;
 		struct iovec aiov;
 		struct uio auio;
 		char *cp;
 
 		cp = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = curthread;
 		auio.uio_resid = MAXPATHLEN;
 		rdlnk.a_uio = &auio;
 		rdlnk.a_vp = ap->a_vp;
 		rdlnk.a_cred = ap->a_cred;
 		if (cd9660_readlink(&rdlnk) == 0)
 			vap->va_size = MAXPATHLEN - auio.uio_resid;
 		free(cp, M_TEMP);
 	}
 	vap->va_flags	= 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = ip->i_mnt->logical_block_size;
 	vap->va_bytes	= (u_quad_t) ip->i_size;
 	vap->va_type	= vp->v_type;
 	vap->va_filerev	= 0;
 	return (0);
 }
 
 /*
  * Vnode op for ioctl.
  */
 static int
 cd9660_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long  a_command;
 		caddr_t  a_data;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct iso_node *ip;
 	int error;
 
 	vp = ap->a_vp;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		VOP_UNLOCK(vp, 0);
 		return (EOPNOTSUPP);
 	}
 
 	ip = VTOI(vp);
 	error = 0;
 
 	switch (ap->a_command) {
 	case FIOGETLBA:
 		*(int *)(ap->a_data) = ip->iso_start;
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * Vnode op for reading.
  */
 static int
 cd9660_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct iso_node *ip = VTOI(vp);
 	struct iso_mnt *imp;
 	struct buf *bp;
 	daddr_t lbn, rablock;
 	off_t diff;
 	int rasize, error = 0;
 	int seqcount;
 	long size, n, on;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	imp = ip->i_mnt;
 	do {
 		lbn = lblkno(imp, uio->uio_offset);
 		on = blkoff(imp, uio->uio_offset);
 		n = MIN(imp->logical_block_size - on, uio->uio_resid);
 		diff = (off_t)ip->i_size - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = blksize(imp, ip, lbn);
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			if (lblktosize(imp, rablock) < ip->i_size)
 				error = cluster_read(vp, (off_t)ip->i_size,
 					 lbn, size, NOCRED, uio->uio_resid,
 					 (ap->a_ioflag >> 16), 0, &bp);
 			else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		} else {
 			if (seqcount > 1 &&
 			    lblktosize(imp, rablock) < ip->i_size) {
 				rasize = blksize(imp, ip, rablock);
 				error = breadn(vp, lbn, size, &rablock,
 					       &rasize, 1, NOCRED, &bp);
 			} else
 				error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error != 0)
 			return (error);
 		n = MIN(n, size - bp->b_resid);
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Structure for reading directories
  */
 struct isoreaddir {
 	struct dirent saveent;
 	struct dirent assocent;
 	struct dirent current;
 	off_t saveoff;
 	off_t assocoff;
 	off_t curroff;
 	struct uio *uio;
 	off_t uio_off;
 	int eofflag;
 	u_long *cookies;
 	int ncookies;
 };
 
 static int
 iso_uiodir(idp,dp,off)
 	struct isoreaddir *idp;
 	struct dirent *dp;
 	off_t off;
 {
 	int error;
 
 	dp->d_reclen = GENERIC_DIRSIZ(dp);
 	dirent_terminate(dp);
 
 	if (idp->uio->uio_resid < dp->d_reclen) {
 		idp->eofflag = 0;
 		return (-1);
 	}
 
 	if (idp->cookies) {
 		if (idp->ncookies <= 0) {
 			idp->eofflag = 0;
 			return (-1);
 		}
 
 		*idp->cookies++ = off;
 		--idp->ncookies;
 	}
 
 	if ((error = uiomove(dp, dp->d_reclen, idp->uio)) != 0)
 		return (error);
 	idp->uio_off = off;
 	return (0);
 }
 
 static int
 iso_shipdir(idp)
 	struct isoreaddir *idp;
 {
 	struct dirent *dp;
 	int cl, sl, assoc;
 	int error;
 	char *cname, *sname;
 
 	cl = idp->current.d_namlen;
 	cname = idp->current.d_name;
 	assoc = (cl > 1) && (*cname == ASSOCCHAR);
 	if (assoc) {
 		cl--;
 		cname++;
 	}
 
 	dp = &idp->saveent;
 	sname = dp->d_name;
 	if (!(sl = dp->d_namlen)) {
 		dp = &idp->assocent;
 		sname = dp->d_name + 1;
 		sl = dp->d_namlen - 1;
 	}
 	if (sl > 0) {
 		if (sl != cl
 		    || bcmp(sname,cname,sl)) {
 			if (idp->assocent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->assocent,idp->assocoff)) != 0)
 					return (error);
 				idp->assocent.d_namlen = 0;
 			}
 			if (idp->saveent.d_namlen) {
 				if ((error = iso_uiodir(idp,&idp->saveent,idp->saveoff)) != 0)
 					return (error);
 				idp->saveent.d_namlen = 0;
 			}
 		}
 	}
 	idp->current.d_reclen = GENERIC_DIRSIZ(&idp->current);
 	if (assoc) {
 		idp->assocoff = idp->curroff;
 		memcpy(&idp->assocent, &idp->current, idp->current.d_reclen);
 	} else {
 		idp->saveoff = idp->curroff;
 		memcpy(&idp->saveent, &idp->current, idp->current.d_reclen);
 	}
 	return (0);
 }
 
 /*
  * Vnode op for readdir
  */
 static int
 cd9660_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct uio *uio = ap->a_uio;
 	struct isoreaddir *idp;
 	struct vnode *vdp = ap->a_vp;
 	struct iso_node *dp;
 	struct iso_mnt *imp;
 	struct buf *bp = NULL;
 	struct iso_directory_record *ep;
 	int entryoffsetinblock;
 	doff_t endsearch;
 	u_long bmask;
 	int error = 0;
 	int reclen;
 	u_short namelen;
 	u_int ncookies = 0;
 	u_long *cookies = NULL;
 	cd_ino_t ino;
 
 	dp = VTOI(vdp);
 	imp = dp->i_mnt;
 	bmask = imp->im_bmask;
 
 	idp = malloc(sizeof(*idp), M_TEMP, M_WAITOK);
 	idp->saveent.d_namlen = idp->assocent.d_namlen = 0;
 	/*
 	 * XXX
 	 * Is it worth trying to figure out the type?
 	 */
 	idp->saveent.d_type = idp->assocent.d_type = idp->current.d_type =
 	    DT_UNKNOWN;
 	idp->uio = uio;
 	if (ap->a_ncookies == NULL) {
 		idp->cookies = NULL;
 	} else {
 		/*
 		 * Guess the number of cookies needed.
 		 */
 		ncookies = uio->uio_resid / 16;
 		cookies = malloc(ncookies * sizeof(u_long),
 		    M_TEMP, M_WAITOK);
 		idp->cookies = cookies;
 		idp->ncookies = ncookies;
 	}
 	idp->eofflag = 1;
 	idp->curroff = uio->uio_offset;
 	idp->uio_off = uio->uio_offset;
 
 	if ((entryoffsetinblock = idp->curroff & bmask) &&
 	    (error = cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp))) {
 		free(idp, M_TEMP);
 		return (error);
 	}
 	endsearch = dp->i_size;
 
 	while (idp->curroff < endsearch) {
 		/*
 		 * If offset is on a block boundary,
 		 * read the next directory block.
 		 * Release previous if it exists.
 		 */
 		if ((idp->curroff & bmask) == 0) {
 			if (bp != NULL)
 				brelse(bp);
 			if ((error =
 			    cd9660_blkatoff(vdp, (off_t)idp->curroff, NULL, &bp)) != 0)
 				break;
 			entryoffsetinblock = 0;
 		}
 		/*
 		 * Get pointer to next entry.
 		 */
 		ep = (struct iso_directory_record *)
 			((char *)bp->b_data + entryoffsetinblock);
 
 		reclen = isonum_711(ep->length);
 		if (reclen == 0) {
 			/* skip to next block, if any */
 			idp->curroff =
 			    (idp->curroff & ~bmask) + imp->logical_block_size;
 			continue;
 		}
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		if (entryoffsetinblock + reclen > imp->logical_block_size) {
 			error = EINVAL;
 			/* illegal directory, so stop looking */
 			break;
 		}
 
 		idp->current.d_namlen = isonum_711(ep->name_len);
 
 		if (reclen < ISO_DIRECTORY_RECORD_SIZE + idp->current.d_namlen) {
 			error = EINVAL;
 			/* illegal entry, stop */
 			break;
 		}
 
 		if (isonum_711(ep->flags)&2)
 			idp->current.d_fileno = isodirino(ep, imp);
 		else
 			idp->current.d_fileno = dbtob(bp->b_blkno) +
 				entryoffsetinblock;
 
 		idp->curroff += reclen;
 		/* NOTE: d_off is the offset of *next* entry. */
 		idp->current.d_off = idp->curroff;
 
 		switch (imp->iso_ftype) {
 		case ISO_FTYPE_RRIP:
 			ino = idp->current.d_fileno;
 			cd9660_rrip_getname(ep, idp->current.d_name, &namelen,
 			    &ino, imp);
 			idp->current.d_fileno = ino;
 			idp->current.d_namlen = (u_char)namelen;
 			if (idp->current.d_namlen)
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			break;
 		default: /* ISO_FTYPE_DEFAULT || ISO_FTYPE_9660 || ISO_FTYPE_HIGH_SIERRA*/
 			strcpy(idp->current.d_name,"..");
 			if (idp->current.d_namlen == 1 && ep->name[0] == 0) {
 				idp->current.d_namlen = 1;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else if (idp->current.d_namlen == 1 && ep->name[0] == 1) {
 				idp->current.d_namlen = 2;
 				error = iso_uiodir(idp,&idp->current,idp->curroff);
 			} else {
 				isofntrans(ep->name,idp->current.d_namlen,
 					   idp->current.d_name, &namelen,
 					   imp->iso_ftype == ISO_FTYPE_9660,
 					   isonum_711(ep->flags)&4,
 					   imp->joliet_level,
 					   imp->im_flags,
 					   imp->im_d2l);
 				idp->current.d_namlen = (u_char)namelen;
 				if (imp->iso_ftype == ISO_FTYPE_DEFAULT)
 					error = iso_shipdir(idp);
 				else
 					error = iso_uiodir(idp,&idp->current,idp->curroff);
 			}
 		}
 		if (error)
 			break;
 
 		entryoffsetinblock += reclen;
 	}
 
 	if (!error && imp->iso_ftype == ISO_FTYPE_DEFAULT) {
 		idp->current.d_namlen = 0;
 		error = iso_shipdir(idp);
 	}
 	if (error < 0)
 		error = 0;
 
 	if (ap->a_ncookies != NULL) {
 		if (error)
 			free(cookies, M_TEMP);
 		else {
 			/*
 			 * Work out the number of cookies actually used.
 			 */
 			*ap->a_ncookies = ncookies - idp->ncookies;
 			*ap->a_cookies = cookies;
 		}
 	}
 
 	if (bp)
 		brelse (bp);
 
 	uio->uio_offset = idp->uio_off;
 	*ap->a_eofflag = idp->eofflag;
 
 	free(idp, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link
  * Shouldn't we get the parent vnode and read the data from there?
  * This could eventually result in deadlocks in cd9660_lookup.
  * But otherwise the block read here is in the block buffer two times.
  */
 typedef struct iso_directory_record ISODIR;
 typedef struct iso_node		    ISONODE;
 typedef struct iso_mnt		    ISOMNT;
 static int
 cd9660_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	ISONODE	*ip;
 	ISODIR	*dirp;
 	ISOMNT	*imp;
 	struct	buf *bp;
 	struct	uio *uio;
 	u_short	symlen;
 	int	error;
 	char	*symname;
 
 	ip  = VTOI(ap->a_vp);
 	imp = ip->i_mnt;
 	uio = ap->a_uio;
 
 	if (imp->iso_ftype != ISO_FTYPE_RRIP)
 		return (EINVAL);
 
 	/*
 	 * Get parents directory record block that this inode included.
 	 */
 	error = bread(imp->im_devvp,
 		      (ip->i_number >> imp->im_bshift) <<
 		      (imp->im_bshift - DEV_BSHIFT),
 		      imp->logical_block_size, NOCRED, &bp);
 	if (error) {
 		return (EINVAL);
 	}
 
 	/*
 	 * Setup the directory pointer for this inode
 	 */
 	dirp = (ISODIR *)(bp->b_data + (ip->i_number & imp->im_bmask));
 
 	/*
 	 * Just make sure, we have a right one....
 	 *   1: Check not cross boundary on block
 	 */
 	if ((ip->i_number & imp->im_bmask) + isonum_711(dirp->length)
 	    > (unsigned)imp->logical_block_size) {
 		brelse(bp);
 		return (EINVAL);
 	}
 
 	/*
 	 * Now get a buffer
 	 * Abuse a namei buffer for now.
 	 */
 	if (uio->uio_segflg == UIO_SYSSPACE)
 		symname = uio->uio_iov->iov_base;
 	else
 		symname = uma_zalloc(namei_zone, M_WAITOK);
 
 	/*
 	 * Ok, we just gathering a symbolic name in SL record.
 	 */
 	if (cd9660_rrip_getsymname(dirp, symname, &symlen, imp) == 0) {
 		if (uio->uio_segflg != UIO_SYSSPACE)
 			uma_zfree(namei_zone, symname);
 		brelse(bp);
 		return (EINVAL);
 	}
 	/*
 	 * Don't forget before you leave from home ;-)
 	 */
 	brelse(bp);
 
 	/*
 	 * return with the symbolic name to caller's.
 	 */
 	if (uio->uio_segflg != UIO_SYSSPACE) {
 		error = uiomove(symname, symlen, uio);
 		uma_zfree(namei_zone, symname);
 		return (error);
 	}
 	uio->uio_resid -= symlen;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + symlen;
 	uio->uio_iov->iov_len -= symlen;
 	return (0);
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  */
 static int
 cd9660_strategy(ap)
 	struct vop_strategy_args /* {
 		struct buf *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct iso_node *ip;
 	struct bufobj *bo;
 
 	ip = VTOI(vp);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("cd9660_strategy: spec");
 	if (bp->b_blkno == bp->b_lblkno) {
 		bp->b_blkno = (ip->iso_start + bp->b_lblkno) <<
 		    (ip->i_mnt->im_bshift - DEV_BSHIFT);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = ip->i_mnt->im_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Return POSIX pathconf information applicable to cd9660 filesystems.
  */
 static int
 cd9660_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		register_t *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 32;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NAME_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP)
 			*ap->a_retval = NAME_MAX;
 		else
 			*ap->a_retval = 37;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		if (VTOI(ap->a_vp)->i_mnt->iso_ftype == ISO_FTYPE_RRIP) {
 			*ap->a_retval = MAXPATHLEN;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Vnode pointer to File handle
  */
 static int
 cd9660_vptofh(ap)
 	struct vop_vptofh_args /* {
 		struct vnode *a_vp;
 		struct fid *a_fhp;
 	} */ *ap;
 {
 	struct ifid ifh;
 	struct iso_node *ip = VTOI(ap->a_vp);
 
 	ifh.ifid_len = sizeof(struct ifid);
 
 	ifh.ifid_ino = ip->i_number;
 	ifh.ifid_start = ip->iso_start;
 	/*
 	 * This intentionally uses sizeof(ifh) in order to not copy stack
 	 * garbage on ILP32.
 	 */
 	memcpy(ap->a_fhp, &ifh, sizeof(ifh));
 
 #ifdef	ISOFS_DBG
 	printf("vptofh: ino %jd, start %ld\n",
 	    (uintmax_t)ifh.ifid_ino, ifh.ifid_start);
 #endif
 
 	return (0);
 }
 
 SYSCTL_NODE(_vfs, OID_AUTO, cd9660, CTLFLAG_RW, 0, "cd9660 filesystem");
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_cd9660, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of bmap");
 
 static daddr_t
 cd9660_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 
 	return (lblkno(VTOI(vp)->i_mnt, off));
 }
 
 static int
 cd9660_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	struct iso_node *ip;
 
 	ip = VTOI(vp);
 	return (blksize(ip->i_mnt, ip, lbn));
 }
 
 static int
 cd9660_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (use_buf_pager)
 		return (vfs_bio_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, cd9660_gbp_getblkno,
 		    cd9660_gbp_getblksz));
 	return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, NULL, NULL));
 }
 
 /*
  * Global vfs data structures for cd9660
  */
 struct vop_vector cd9660_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_open =		cd9660_open,
 	.vop_access =		cd9660_access,
 	.vop_bmap =		cd9660_bmap,
 	.vop_cachedlookup =	cd9660_lookup,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_ioctl =		cd9660_ioctl,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_pathconf =		cd9660_pathconf,
 	.vop_read =		cd9660_read,
 	.vop_readdir =		cd9660_readdir,
 	.vop_readlink =		cd9660_readlink,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_strategy =		cd9660_strategy,
 	.vop_vptofh =		cd9660_vptofh,
 	.vop_getpages =		cd9660_getpages,
 };
+VFS_VOP_VECTOR_REGISTER(cd9660_vnodeops);
 
 /*
  * Special device vnode ops
  */
 
 struct vop_vector cd9660_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		cd9660_access,
 	.vop_getattr =		cd9660_getattr,
 	.vop_inactive =		cd9660_inactive,
 	.vop_reclaim =		cd9660_reclaim,
 	.vop_setattr =		cd9660_setattr,
 	.vop_vptofh =		cd9660_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(cd9660_fifoops);
Index: head/sys/fs/deadfs/dead_vnops.c
===================================================================
--- head/sys/fs/deadfs/dead_vnops.c	(revision 355789)
+++ head/sys/fs/deadfs/dead_vnops.c	(revision 355790)
@@ -1,159 +1,160 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)dead_vnops.c	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
 #include <sys/vnode.h>
 
 /*
  * Prototypes for dead operations on vnodes.
  */
 static vop_lookup_t	dead_lookup;
 static vop_open_t	dead_open;
 static vop_getwritemount_t dead_getwritemount;
 static vop_rename_t	dead_rename;
 static vop_unset_text_t	dead_unset_text;
 
 struct vop_vector dead_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_EBADF,
 	.vop_advlock =		VOP_EBADF,
 	.vop_bmap =		VOP_EBADF,
 	.vop_create =		VOP_PANIC,
 	.vop_getattr =		VOP_EBADF,
 	.vop_getwritemount =	dead_getwritemount,
 	.vop_inactive =		VOP_NULL,
 	.vop_ioctl =		VOP_EBADF,
 	.vop_link =		VOP_PANIC,
 	.vop_lookup =		dead_lookup,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		dead_open,
 	.vop_pathconf =		VOP_EBADF,	/* per pathconf(2) */
 	.vop_poll =		dead_poll,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_EBADF,
 	.vop_readlink =		VOP_EBADF,
 	.vop_reclaim =		VOP_NULL,
 	.vop_remove =		VOP_PANIC,
 	.vop_rename =		dead_rename,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		VOP_EBADF,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		VOP_EBADF,
 	.vop_unset_text =	dead_unset_text,
 	.vop_write =		dead_write,
 };
+VFS_VOP_VECTOR_REGISTER(dead_vnodeops);
 
 static int
 dead_getwritemount(struct vop_getwritemount_args *ap)
 {
 
 	*(ap->a_mpp) = NULL;
 	return (0);
 }
 
 /*
  * Trivial lookup routine that always fails.
  */
 static int
 dead_lookup(struct vop_lookup_args *ap)
 {
 
 	*ap->a_vpp = NULL;
 	return (ENOTDIR);
 }
 
 /*
  * Open always fails as if device did not exist.
  */
 static int
 dead_open(struct vop_open_args *ap)
 {
 
 	return (ENXIO);
 }
 
 int
 dead_read(struct vop_read_args *ap)
 {
 
 	/*
 	 * Return EOF for tty devices, EIO for others
 	 */
 	if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
 		return (EIO);
 	return (0);
 }
 
 int
 dead_write(struct vop_write_args *ap)
 {
 
 	return (EIO);
 }
 
 int
 dead_poll(struct vop_poll_args *ap)
 {
 
 	if (ap->a_events & ~POLLSTANDARD)
 		return (POLLNVAL);
 
 	/*
 	 * Let the user find out that the descriptor is gone.
 	 */
 	return (POLLHUP | ((POLLIN | POLLRDNORM) & ap->a_events));
 
 }
 
 static int
 dead_rename(struct vop_rename_args *ap)
 {
 
 	vop_rename_fail(ap);
 	return (EXDEV);
 }
 
 static int
 dead_unset_text(struct vop_unset_text_args *ap)
 {
 
 	return (0);
 }
Index: head/sys/fs/devfs/devfs_vnops.c
===================================================================
--- head/sys/fs/devfs/devfs_vnops.c	(revision 355789)
+++ head/sys/fs/devfs/devfs_vnops.c	(revision 355790)
@@ -1,1980 +1,1982 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	mkdir: want it ?
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct vop_vector devfs_specops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data");
 
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 struct mtx	cdevpriv_mtx;
 MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF);
 
 SYSCTL_DECL(_vfs_devfs);
 
 static int devfs_dotimes;
 SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW,
     &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision");
 
 /*
  * Update devfs node timestamp.  Note that updates are unlocked and
  * stat(2) could see partially updated times.
  */
 static void
 devfs_timestamp(struct timespec *tsp)
 {
 	time_t ts;
 
 	if (devfs_dotimes) {
 		vfs_timestamp(tsp);
 	} else {
 		ts = time_second;
 		if (tsp->tv_sec != ts) {
 			tsp->tv_sec = ts;
 			tsp->tv_nsec = 0;
 		}
 	}
 }
 
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp,
     int *ref)
 {
 
 	*dswp = devvn_refthread(fp->f_vnode, devp, ref);
 	if (*devp != fp->f_data) {
 		if (*dswp != NULL)
 			dev_relthread(*devp, *ref);
 		return (ENXIO);
 	}
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 	if (*dswp == NULL)
 		return (ENXIO);
 	curthread->td_fpop = fp;
 	return (0);
 }
 
 int
 devfs_get_cdevpriv(void **datap)
 {
 	struct file *fp;
 	struct cdev_privdata *p;
 	int error;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (EBADF);
 	p = fp->f_cdevpriv;
 	if (p != NULL) {
 		error = 0;
 		*datap = p->cdpd_data;
 	} else
 		error = ENOENT;
 	return (error);
 }
 
 int
 devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr)
 {
 	struct file *fp;
 	struct cdev_priv *cdp;
 	struct cdev_privdata *p;
 	int error;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (ENOENT);
 	cdp = cdev2priv((struct cdev *)fp->f_data);
 	p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK);
 	p->cdpd_data = priv;
 	p->cdpd_dtr = priv_dtr;
 	p->cdpd_fp = fp;
 	mtx_lock(&cdevpriv_mtx);
 	if (fp->f_cdevpriv == NULL) {
 		LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list);
 		fp->f_cdevpriv = p;
 		mtx_unlock(&cdevpriv_mtx);
 		error = 0;
 	} else {
 		mtx_unlock(&cdevpriv_mtx);
 		free(p, M_CDEVPDATA);
 		error = EBUSY;
 	}
 	return (error);
 }
 
 void
 devfs_destroy_cdevpriv(struct cdev_privdata *p)
 {
 
 	mtx_assert(&cdevpriv_mtx, MA_OWNED);
 	KASSERT(p->cdpd_fp->f_cdevpriv == p,
 	    ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p));
 	p->cdpd_fp->f_cdevpriv = NULL;
 	LIST_REMOVE(p, cdpd_list);
 	mtx_unlock(&cdevpriv_mtx);
 	(p->cdpd_dtr)(p->cdpd_data);
 	free(p, M_CDEVPDATA);
 }
 
 static void
 devfs_fpdrop(struct file *fp)
 {
 	struct cdev_privdata *p;
 
 	mtx_lock(&cdevpriv_mtx);
 	if ((p = fp->f_cdevpriv) == NULL) {
 		mtx_unlock(&cdevpriv_mtx);
 		return;
 	}
 	devfs_destroy_cdevpriv(p);
 }
 
 void
 devfs_clear_cdevpriv(void)
 {
 	struct file *fp;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return;
 	devfs_fpdrop(fp);
 }
 
 /*
  * On success devfs_populate_vp() returns with dmp->dm_lock held.
  */
 static int
 devfs_populate_vp(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	int locked;
 
 	ASSERT_VOP_LOCKED(vp, "devfs_populate_vp");
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	locked = VOP_ISLOCKED(vp);
 
 	sx_xlock(&dmp->dm_lock);
 	DEVFS_DMP_HOLD(dmp);
 
 	/* Can't call devfs_populate() with the vnode lock held. */
 	VOP_UNLOCK(vp, 0);
 	devfs_populate(dmp);
 
 	sx_xunlock(&dmp->dm_lock);
 	vn_lock(vp, locked | LK_RETRY);
 	sx_xlock(&dmp->dm_lock);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ERESTART);
 	}
 	if (VN_IS_DOOMED(vp)) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 	de = vp->v_data;
 	KASSERT(de != NULL,
 	    ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed"));
 	if ((de->de_flags & DE_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 
 	return (0);
 }
 
 static int
 devfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct devfs_mount *dmp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	struct devfs_dirent *dd, *de;
 	int i, error;
 
 	dmp = VFSTODEVFS(vp->v_mount);
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	if (vp->v_type != VCHR && vp->v_type != VDIR) {
 		error = ENOENT;
 		goto finished;
 	}
 
 	dd = vp->v_data;
 	if (vp->v_type == VDIR && dd == dmp->dm_rootdir) {
 		*dvp = vp;
 		vref(*dvp);
 		goto finished;
 	}
 
 	i = *buflen;
 	i -= dd->de_dirent->d_namlen;
 	if (i < 0) {
 		error = ENOMEM;
 		goto finished;
 	}
 	bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen);
 	*buflen = i;
 	de = devfs_parent_dirent(dd);
 	if (de == NULL) {
 		error = ENOENT;
 		goto finished;
 	}
 	mtx_lock(&devfs_de_interlock);
 	*dvp = de->de_vnode;
 	if (*dvp != NULL) {
 		VI_LOCK(*dvp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
 		vref(*dvp);
 		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
 	}
 finished:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * Construct the fully qualified path name relative to the mountpoint.
  * If a NULL cnp is provided, no '/' is appended to the resulting path.
  */
 char *
 devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd,
     struct componentname *cnp)
 {
 	int i;
 	struct devfs_dirent *de;
 
 	sx_assert(&dmp->dm_lock, SA_LOCKED);
 
 	i = SPECNAMELEN;
 	buf[i] = '\0';
 	if (cnp != NULL)
 		i -= cnp->cn_namelen;
 	if (i < 0)
 		 return (NULL);
 	if (cnp != NULL)
 		bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen);
 	de = dd;
 	while (de != dmp->dm_rootdir) {
 		if (cnp != NULL || i < SPECNAMELEN) {
 			i--;
 			if (i < 0)
 				 return (NULL);
 			buf[i] = '/';
 		}
 		i -= de->de_dirent->d_namlen;
 		if (i < 0)
 			 return (NULL);
 		bcopy(de->de_dirent->d_name, buf + i,
 		    de->de_dirent->d_namlen);
 		de = devfs_parent_dirent(de);
 		if (de == NULL)
 			return (NULL);
 	}
 	return (buf + i);
 }
 
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *de;
 
 	de = (struct devfs_dirent *)arg;
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	de->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode,
     struct vnode **vpp)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 	struct cdevsw *dsw;
 
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
 loop:
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			vput(vp);
 			return (ENOENT);
 		}
 		else if (VN_IS_DOOMED(vp)) {
 			mtx_lock(&devfs_de_interlock);
 			if (de->de_vnode == vp) {
 				de->de_vnode = NULL;
 				vp->v_data = NULL;
 			}
 			mtx_unlock(&devfs_de_interlock);
 			vput(vp);
 			goto loop;
 		}
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	if (de->de_dirent->d_type == DT_CHR) {
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protect by vnode lock */
 		vp->v_rdev = dev;
 		KASSERT(vp->v_usecount == 1,
 		    ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount));
 		dev->si_usecount++;
 		/* Special casing of ttys for deadfs.  Probably redundant. */
 		dsw = dev->si_devsw;
 		if (dsw != NULL && (dsw->d_flags & D_TTY) != 0)
 			vp->v_vflag |= VV_ISTTY;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		if ((dev->si_flags & SI_ETERNAL) != 0)
 			vp->v_vflag |= VV_ETERNALDEV;
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS);
 	VN_LOCK_ASHARE(vp);
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	struct proc *p;
 	int error;
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_accmode, ap->a_cred, NULL);
 	if (error == 0)
 		return (0);
 	if (error != EACCES)
 		return (error);
 	p = ap->a_td->td_proc;
 	/* We do, however, allow access to the controlling terminal */
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_CONTROLT)) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	if (p->p_session->s_ttydp == de->de_cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0,
     "devfs-only flag reuse failed");
 
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct proc *p;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int dflags, error, ref, vp_locked;
 
 	/*
 	 * XXX: Don't call d_close() if we were called because of
 	 * XXX: insmntque1() failure.
 	 */
 	if (vp->v_data == NULL)
 		return (0);
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	if (vp->v_usecount == 2 && td != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
 			sx_xlock(&proctree_lock);
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
 				VI_LOCK(vp);
 				if (vp->v_usecount == 2 && vcount(vp) == 1 &&
 				    !VN_IS_DOOMED(vp)) {
 					p->p_session->s_ttyvp = NULL;
 					p->p_session->s_ttydp = NULL;
 					oldvp = vp;
 				}
 				VI_UNLOCK(vp);
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
 			if (oldvp != NULL)
 				vrele(oldvp);
 		} else
 			PROC_UNLOCK(p);
 	}
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	dflags = 0;
 	VI_LOCK(vp);
 	if (vp->v_usecount == 1 && vcount(vp) == 1)
 		dflags |= FLASTCLOSE;
 	if (VN_IS_DOOMED(vp)) {
 		/* Forced close. */
 		dflags |= FREVOKE | FNONBLOCK;
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if ((dflags & FLASTCLOSE) == 0) {
 		VI_UNLOCK(vp);
 		dev_relthread(dev, ref);
 		return (0);
 	}
 	vholdnz(vp);
 	VI_UNLOCK(vp);
 	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td);
 	dev_relthread(dev, ref);
 	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 	int error;
 	struct file *fpop;
 
 	/*
 	 * NB: td may be NULL if this descriptor is closed due to
 	 * garbage collection from a closed UNIX domain socket.
 	 */
 	fpop = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	error = vnops.fo_close(fp, td);
 	curthread->td_fpop = fpop;
 
 	/*
 	 * The f_cdevpriv cannot be assigned non-NULL value while we
 	 * are destroying the file.
 	 */
 	if (fp->f_cdevpriv != NULL)
 		devfs_fpdrop(fp);
 	return (error);
 }
 
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct cdev *dev;
 	struct timeval boottime;
 	int error;
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xunlock(&dmp->dm_lock);
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 	getboottime(&boottime);
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = cdev2priv(dev)->cdp_inode;
 	}
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct file *fpop;
 	int error;
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	error = vnops.fo_ioctl(fp, com, data, cred, td);
 	td->td_fpop = fpop;
 	return (error);
 }
 
 void *
 fiodgname_buf_get_ptr(void *fgnp, u_long com)
 {
 	union {
 		struct fiodgname_arg	fgn;
 #ifdef COMPAT_FREEBSD32
 		struct fiodgname_arg32	fgn32;
 #endif
 	} *fgnup;
 
 	fgnup = fgnp;
 	switch (com) {
 	case FIODGNAME:
 		return (fgnup->fgn.buf);
 #ifdef COMPAT_FREEBSD32
 	case FIODGNAME_32:
 		return ((void *)(uintptr_t)fgnup->fgn32.buf);
 #endif
 	default:
 		panic("Unhandled ioctl command %ld", com);
 	}
 }
 
 static int
 devfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct fiodgname_arg *fgn;
 	struct vnode *vpold, *vp;
 	struct cdevsw *dsw;
 	struct thread *td;
 	struct cdev *dev;
 	int error, ref, i;
 	const char *p;
 	u_long com;
 
 	vp = ap->a_vp;
 	com = ap->a_command;
 	td = ap->a_td;
 
 	dsw = devvn_refthread(vp, &dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(dev)));
 
 	switch (com) {
 	case FIODTYPE:
 		*(int *)ap->a_data = dsw->d_flags & D_TYPEMASK;
 		error = 0;
 		break;
 	case FIODGNAME:
 #ifdef	COMPAT_FREEBSD32
 	case FIODGNAME_32:
 #endif
 		fgn = ap->a_data;
 		p = devtoname(dev);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i);
 		break;
 	default:
 		error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td);
 	}
 
 	dev_relthread(dev, ref);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	if (error == 0 && com == TIOCSCTTY) {
 		/*
 		 * Do nothing if reassigning same control tty, or if the
 		 * control tty has already disappeared.  If it disappeared,
 		 * it's because we were racing with TIOCNOTTY.  TIOCNOTTY
 		 * already took care of releasing the old vnode and we have
 		 * nothing left to do.
 		 */
 		sx_slock(&proctree_lock);
 		if (td->td_proc->p_session->s_ttyvp == vp ||
 		    td->td_proc->p_session->s_ttyp == NULL) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		vpold = td->td_proc->p_session->s_ttyvp;
 		VREF(vp);
 		SESS_LOCK(td->td_proc->p_session);
 		td->td_proc->p_session->s_ttyvp = vp;
 		td->td_proc->p_session->s_ttydp = cdev2priv(dev);
 		SESS_UNLOCK(td->td_proc->p_session);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			vrele(vpold);
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error, ref;
 	struct file *fpop;
 	struct thread *td;
 
 	td = curthread;
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error)
 		return (error);
 	error = dsw->d_kqfilter(dev, kn);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 static inline int
 devfs_prison_check(struct devfs_dirent *de, struct thread *td)
 {
 	struct cdev_priv *cdp;
 	struct ucred *dcr;
 	struct proc *p;
 	int error;
 
 	cdp = de->de_cdp;
 	if (cdp == NULL)
 		return (0);
 	dcr = cdp->cdp_c.si_cred;
 	if (dcr == NULL)
 		return (0);
 
 	error = prison_check(td->td_ucred, dcr);
 	if (error == 0)
 		return (0);
 	/* We do, however, allow access to the controlling terminal */
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_CONTROLT)) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	if (p->p_session->s_ttydp == cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct mount *mp;
 	struct cdev *cdev;
 	int error, flags, nameiop, dvplocked;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	mp = dvp->v_mount;
 	dmp = VFSTODEVFS(mp);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		de = devfs_parent_dirent(dd);
 		if (de == NULL)
 			return (ENOENT);
 		dvplocked = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 		error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK,
 		    vpp);
 		*dm_unlock = 0;
 		vn_lock(dvp, dvplocked | LK_RETRY);
 		return (error);
 	}
 
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dmp, dd, cnp);
 		if (pname == NULL)
 			break;
 
 		cdev = NULL;
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 
 		if (cdev == NULL)
 			sx_xlock(&dmp->dm_lock);
 		else if (devfs_populate_vp(dvp) != 0) {
 			*dm_unlock = 0;
 			sx_xlock(&dmp->dm_lock);
 			if (DEVFS_DMP_DROP(dmp)) {
 				sx_xunlock(&dmp->dm_lock);
 				devfs_unmount_final(dmp);
 			} else
 				sx_xunlock(&dmp->dm_lock);
 			dev_rel(cdev);
 			return (ENOENT);
 		}
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			if (cdev != NULL)
 				dev_rel(cdev);
 			return (ENOENT);
 		}
 
 		if (cdev == NULL)
 			break;
 
 		dev_lock();
 		dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	if (devfs_prison_check(de, td))
 		return (ENOENT);
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp);
 	*dm_unlock = 0;
 	return (error);
 }
 
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	int j;
 	struct devfs_mount *dmp;
 	int dm_unlock;
 
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOTDIR);
 
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	dm_unlock = 1;
 	j = devfs_lookupx(ap, &dm_unlock);
 	if (dm_unlock == 1)
 		sx_xunlock(&dmp->dm_lock);
 	return (j);
 }
 
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dd = dvp->v_data;
 
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		if (de->de_dirent->d_type == DT_CHR &&
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error, ref, vlocked;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	if (fp == NULL && dsw->d_fdopen != NULL) {
 		dev_relthread(dev, ref);
 		return (ENXIO);
 	}
 
 	vlocked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	if (fp != NULL) {
 		fp->f_data = dev;
 		fp->f_vnode = vp;
 	}
 	if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	/* Clean up any cdevpriv upon error. */
 	if (error != 0)
 		devfs_clear_cdevpriv();
 	td->td_fpop = fpop;
 
 	vn_lock(vp, vlocked | LK_RETRY);
 	dev_relthread(dev, ref);
 	if (error != 0) {
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp"));
 #else
 	if (fp == NULL)
 		return (error);
 #endif
 	if (fp->f_ops == &badfileops)
 		finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	return (error);
 }
 
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = INT_MAX;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		return (0);
 	case _PC_MAX_CANON:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = MAX_CANON;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_MAX_INPUT:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = MAX_INPUT;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_VDISABLE:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = _POSIX_VDISABLE;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		/*
 		 * If MAC is enabled, devfs automatically supports
 		 * trivial non-persistant label storage.
 		 */
 		*ap->a_retval = 1;
 #else
 		*ap->a_retval = 0;
 #endif
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error, ref;
 	struct file *fpop;
 
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_poll(fp, events, cred, td);
 		return (error);
 	}
 	error = dsw->d_poll(dev, events, td);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	return(error);
 }
 
 /*
  * Print out the contents of a special device vnode.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 
 	printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev));
 	return (0);
 }
 
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int ioflag, error, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_read(fp, uio, cred, flags, td);
 		return (error);
 	}
 	resid = uio->uio_resid;
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		devfs_timestamp(&dev->si_atime);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on 
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	if (devfs_populate_vp(ap->a_vp) != 0) {
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & (DE_COVERED | DE_WHITEOUT))
 			continue;
 		if (devfs_prison_check(dd, uio->uio_td))
 			continue;
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp));
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
 		/* NOTE: d_off is the offset for the *next* entry. */
 		dp->d_off = off + dp->d_reclen;
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *de;
 
 	de = ap->a_vp->v_data;
 	return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio));
 }
 
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_dirent *de;
 
 	vp = ap->a_vp;
 	mtx_lock(&devfs_de_interlock);
 	de = vp->v_data;
 	if (de != NULL) {
 		de->de_vnode = NULL;
 		vp->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 	return (0);
 }
 
 static int
 devfs_reclaim_vchr(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct cdev *dev;
 
 	vp = ap->a_vp;
 	MPASS(vp->v_type == VCHR);
 
 	devfs_reclaim(ap);
 
 	VI_LOCK(vp);
 	dev_lock();
 	dev = vp->v_rdev;
 	vp->v_rdev = NULL;
 	if (dev != NULL)
 		dev->si_usecount -= (vp->v_usecount > 0);
 	dev_unlock();
 	VI_UNLOCK(vp);
 	if (dev != NULL)
 		dev_rel(dev);
 	return (0);
 }
 
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	ASSERT_VOP_ELOCKED(dvp, "devfs_remove");
 	ASSERT_VOP_ELOCKED(vp, "devfs_remove");
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_dirent->d_type == DT_LNK) {
 			de_covered = devfs_find(dd, de->de_dirent->d_name,
 			    de->de_dirent->d_namlen, 0);
 			if (de_covered != NULL)
 				de_covered->de_flags &= ~DE_COVERED;
 		}
 		/* We need to unlock dvp because devfs_delete() may lock it. */
 		VOP_UNLOCK(vp, 0);
 		if (dvp != vp)
 			VOP_UNLOCK(dvp, 0);
 		devfs_delete(dmp, de, 0);
 		sx_xunlock(&dmp->dm_lock);
 		if (dvp != vp)
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		de->de_flags |= DE_WHITEOUT;
 		sx_xunlock(&dmp->dm_lock);
 	}
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	u_int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = cdev2priv(dev);
  
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp,0);
  loop:
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				dev_unlock();
 				VI_LOCK(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK,
 				    curthread))
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			} 
 		}
 		if (vp2 != NULL) {
 			continue;
 		}
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_mount *dmp;
 	int error;
 
 	vp = ap->a_vp;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	VOP_UNLOCK(vp, 0);
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	td = curthread;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(td, PRIV_VFS_CHOWN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(td, PRIV_VFS_ADMIN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		error = vn_utimes_perm(vp, vap, ap->a_cred, td);
 		if (error != 0)
 			goto ret;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 
 ret:
 	sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock);
 	return (error);
 }
 
 #ifdef MAC
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_dirent *de;
 
 	vp = ap->a_vp;
 	de = vp->v_data;
 
 	mac_vnode_relabel(ap->a_cred, vp, ap->a_label);
 	mac_devfs_update(vp->v_mount, de, vp);
 
 	return (0);
 }
 #endif
 
 static int
 devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_stat(fp, sb, cred, td));
 }
 
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered, *de_dotdot;
 	struct devfs_mount *dmp;
 
 	error = priv_check(curthread, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOENT);
 
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_flags = DE_USER;
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dir = dd;
 	de->de_dirent->d_type = DT_LNK;
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	de_covered = devfs_find(dd, de->de_dirent->d_name,
 	    de->de_dirent->d_namlen, 0);
 	if (de_covered != NULL) {
 		if ((de_covered->de_flags & DE_USER) != 0) {
 			devfs_delete(dmp, de, DEVFS_DEL_NORECURSE);
 			sx_xunlock(&dmp->dm_lock);
 			return (EEXIST);
 		}
 		KASSERT((de_covered->de_flags & DE_COVERED) == 0,
 		    ("devfs_symlink: entry %p already covered", de_covered));
 		de_covered->de_flags |= DE_COVERED;
 	}
 
 	de_dotdot = TAILQ_FIRST(&dd->de_dlist);		/* "." */
 	de_dotdot = TAILQ_NEXT(de_dotdot, de_list);	/* ".." */
 	TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list);
 	devfs_dir_ref_de(dmp, dd);
 	devfs_rules_apply(dmp, de);
 
 	return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp));
 }
 
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_truncate(fp, length, cred, td));
 }
 
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int error, ioflag, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_write(fp, uio, cred, flags, td);
 		return (error);
 	}
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	resid = uio->uio_resid;
 
 	error = dsw->d_write(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		devfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (error);
 }
 
 static int
 devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fpop;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error, ref;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 * The one exception is that D_MMAP_ANON devices
 	 * (i.e. /dev/zero) permit private writable mappings.
 	 *
 	 * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests
 	 * as well as updating maxprot to permit writing for
 	 * D_MMAP_ANON devices rather than doing that here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	maxprot &= cap_maxprot;
 
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff,
 	    &object);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(object);
 	return (error);
 }
 
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x == NULL)
 		return (NODEV);
 	return (cdev2priv(x)->cdp_inode);
 }
 
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_chmod =	vn_chmod,
 	.fo_chown =	vn_chown,
 	.fo_sendfile =	vn_sendfile,
 	.fo_seek =	vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap =	devfs_mmap_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /* Vops for non-CHR vnodes in /dev. */
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 	.vop_vptocnp =		devfs_vptocnp,
 };
+VFS_VOP_VECTOR_REGISTER(devfs_vnodeops);
 
 /* Vops for VCHR vnodes in /dev. */
 static struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		vop_stdfsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_ioctl,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_poll =		dead_poll,
 	.vop_print =		devfs_print,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim_vchr,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_write =		dead_write,
 };
+VFS_VOP_VECTOR_REGISTER(devfs_specops);
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ 
  * flags instead since that's what open(), close() and ioctl() takes and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTS make sure nobody
  * pulls the rug out under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
Index: head/sys/fs/ext2fs/ext2_vnops.c
===================================================================
--- head/sys/fs/ext2fs/ext2_vnops.c	(revision 355789)
+++ head/sys/fs/ext2fs/ext2_vnops.c	(revision 355790)
@@ -1,2347 +1,2349 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
  *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
  * $FreeBSD$
  */
 
 #include "opt_suiddir.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/endian.h>
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/lockf.h>
 #include <sys/event.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/extattr.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include "opt_directio.h"
 
 #include <ufs/ufs/dir.h>
 
 #include <fs/ext2fs/fs.h>
 #include <fs/ext2fs/inode.h>
 #include <fs/ext2fs/ext2_acl.h>
 #include <fs/ext2fs/ext2fs.h>
 #include <fs/ext2fs/ext2_extern.h>
 #include <fs/ext2fs/ext2_dinode.h>
 #include <fs/ext2fs/ext2_dir.h>
 #include <fs/ext2fs/ext2_mount.h>
 #include <fs/ext2fs/ext2_extattr.h>
 #include <fs/ext2fs/ext2_extents.h>
 
 SDT_PROVIDER_DECLARE(ext2fs);
 /*
  * ext2fs trace probe:
  * arg0: verbosity. Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*");
 
 static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *);
 static void ext2_itimes_locked(struct vnode *);
 
 static vop_access_t	ext2_access;
 static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *,
     struct thread *);
 static vop_close_t	ext2_close;
 static vop_create_t	ext2_create;
 static vop_fsync_t	ext2_fsync;
 static vop_getattr_t	ext2_getattr;
 static vop_ioctl_t	ext2_ioctl;
 static vop_link_t	ext2_link;
 static vop_mkdir_t	ext2_mkdir;
 static vop_mknod_t	ext2_mknod;
 static vop_open_t	ext2_open;
 static vop_pathconf_t	ext2_pathconf;
 static vop_print_t	ext2_print;
 static vop_read_t	ext2_read;
 static vop_readlink_t	ext2_readlink;
 static vop_remove_t	ext2_remove;
 static vop_rename_t	ext2_rename;
 static vop_rmdir_t	ext2_rmdir;
 static vop_setattr_t	ext2_setattr;
 static vop_strategy_t	ext2_strategy;
 static vop_symlink_t	ext2_symlink;
 static vop_write_t	ext2_write;
 static vop_deleteextattr_t	ext2_deleteextattr;
 static vop_getextattr_t	ext2_getextattr;
 static vop_listextattr_t	ext2_listextattr;
 static vop_setextattr_t	ext2_setextattr;
 static vop_vptofh_t	ext2_vptofh;
 static vop_close_t	ext2fifo_close;
 static vop_kqfilter_t	ext2fifo_kqfilter;
 
 /* Global vfs data structures for ext2. */
 struct vop_vector ext2_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		ext2_access,
 	.vop_bmap =		ext2_bmap,
 	.vop_cachedlookup =	ext2_lookup,
 	.vop_close =		ext2_close,
 	.vop_create =		ext2_create,
 	.vop_fsync =		ext2_fsync,
 	.vop_getpages =		vnode_pager_local_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_ioctl =		ext2_ioctl,
 	.vop_link =		ext2_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		ext2_mkdir,
 	.vop_mknod =		ext2_mknod,
 	.vop_open =		ext2_open,
 	.vop_pathconf =		ext2_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ext2_print,
 	.vop_read =		ext2_read,
 	.vop_readdir =		ext2_readdir,
 	.vop_readlink =		ext2_readlink,
 	.vop_reallocblks =	ext2_reallocblks,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_remove =		ext2_remove,
 	.vop_rename =		ext2_rename,
 	.vop_rmdir =		ext2_rmdir,
 	.vop_setattr =		ext2_setattr,
 	.vop_strategy =		ext2_strategy,
 	.vop_symlink =		ext2_symlink,
 	.vop_write =		ext2_write,
 	.vop_deleteextattr =	ext2_deleteextattr,
 	.vop_getextattr =	ext2_getextattr,
 	.vop_listextattr =	ext2_listextattr,
 	.vop_setextattr =	ext2_setextattr,
 #ifdef UFS_ACL
 	.vop_getacl =		ext2_getacl,
 	.vop_setacl =		ext2_setacl,
 	.vop_aclcheck =		ext2_aclcheck,
 #endif /* UFS_ACL */
 	.vop_vptofh =		ext2_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ext2_vnodeops);
 
 struct vop_vector ext2_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		ext2_access,
 	.vop_close =		ext2fifo_close,
 	.vop_fsync =		ext2_fsync,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_kqfilter =		ext2fifo_kqfilter,
 	.vop_pathconf =		ext2_pathconf,
 	.vop_print =		ext2_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_setattr =		ext2_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_vptofh =		ext2_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ext2_fifoops);
 
 /*
  * A virgin directory (no blushing please).
  * Note that the type and namlen fields are reversed relative to ext2.
  * Also, we don't use `struct odirtemplate', since it would just cause
  * endianness problems.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, 1, EXT2_FT_DIR, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".."
 };
 static struct dirtemplate omastertemplate = {
 	0, 12, 1, EXT2_FT_UNKNOWN, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".."
 };
 
 static void
 ext2_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 	if ((vp->v_type == VBLK || vp->v_type == VCHR))
 		ip->i_flag |= IN_LAZYMOD;
 	else
 		ip->i_flag |= IN_MODIFIED;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 		vfs_timestamp(&ts);
 		if (ip->i_flag & IN_ACCESS) {
 			ip->i_atime = ts.tv_sec;
 			ip->i_atimensec = ts.tv_nsec;
 		}
 		if (ip->i_flag & IN_UPDATE) {
 			ip->i_mtime = ts.tv_sec;
 			ip->i_mtimensec = ts.tv_nsec;
 			ip->i_modrev++;
 		}
 		if (ip->i_flag & IN_CHANGE) {
 			ip->i_ctime = ts.tv_sec;
 			ip->i_ctimensec = ts.tv_nsec;
 		}
 	}
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 void
 ext2_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	ext2_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ext2_create(struct vop_create_args *ap)
 {
 	int error;
 
 	error =
 	    ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp);
 	if (error != 0)
 		return (error);
 	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
 	return (0);
 }
 
 static int
 ext2_open(struct vop_open_args *ap)
 {
 
 	if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Update the times on the inode.
  */
 static int
 ext2_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ext2_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 static int
 ext2_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 	int error;
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts on read-only file systems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the file system.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* If immutable bit set, nobody gets to write it. */
 	if ((accmode & VWRITE) && (ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 	error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 	    ap->a_accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 static int
 ext2_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	ext2_itimes(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ip->i_devvp->v_rdev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_nlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	vap->va_rdev = ip->i_rdev;
 	vap->va_size = ip->i_size;
 	vap->va_atime.tv_sec = ip->i_atime;
 	vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0;
 	vap->va_mtime.tv_sec = ip->i_mtime;
 	vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0;
 	vap->va_ctime.tv_sec = ip->i_ctime;
 	vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0;
 	if E2DI_HAS_XTIME(ip) {
 		vap->va_birthtime.tv_sec = ip->i_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_birthnsec;
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_bytes = dbtob((u_quad_t)ip->i_blocks);
 	vap->va_type = IFTOVT(ip->i_mode);
 	vap->va_filerev = ip->i_modrev;
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  */
 static int
 ext2_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		/* Disallow flags not supported by ext2fs. */
 		if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP))
 			return (EOPNOTSUPP);
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes and privileged processes in
 		 * jail() are not permitted to unset system flags, or
 		 * modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
 			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 		} else {
 			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) ||
 			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
 				return (EPERM);
 		}
 		ip->i_flags = vap->va_flags;
 		ip->i_flag |= IN_CHANGE;
 		if (ip->i_flags & (IMMUTABLE | APPEND))
 			return (0);
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * Disallow write attempts on read-only file systems;
 		 * unless the file is a socket, fifo, or a block or
 		 * character device resident on the file system.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 		if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * From utimes(2):
 		 * If times is NULL, ... The caller must be the owner of
 		 * the file, have permission to write the file, or be the
 		 * super-user.
 		 * If times is non-NULL, ... The caller must be the owner of
 		 * the file or be the super-user.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, cred, td))))
 			return (error);
 		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_ACCESS;
 			ip->i_atime = vap->va_atime.tv_sec;
 			ip->i_atimensec = vap->va_atime.tv_nsec;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_UPDATE;
 			ip->i_mtime = vap->va_mtime.tv_sec;
 			ip->i_mtimensec = vap->va_mtime.tv_nsec;
 		}
 		ip->i_birthtime = vap->va_birthtime.tv_sec;
 		ip->i_birthnsec = vap->va_birthtime.tv_nsec;
 		error = ext2_update(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		error = priv_check_cred(cred, PRIV_VFS_STICKYFILE);
 		if (error)
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error)
 			return (error);
 	}
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	ip->i_flag |= IN_CHANGE;
 	return (0);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  */
 static int
 ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *td)
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file
 	 * to a group of which we are not a member, the caller must
 	 * have privilege.
 	 */
 	if (uid != ip->i_uid || (gid != ip->i_gid &&
 	    !groupmember(gid, cred))) {
 		error = priv_check_cred(cred, PRIV_VFS_CHOWN);
 		if (error)
 			return (error);
 	}
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 	ip->i_gid = gid;
 	ip->i_uid = uid;
 	ip->i_flag |= IN_CHANGE;
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0)
 			ip->i_mode &= ~(ISUID | ISGID);
 	}
 	return (0);
 }
 
 /*
  * Synch an open file.
  */
 /* ARGSUSED */
 static int
 ext2_fsync(struct vop_fsync_args *ap)
 {
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 
 	vop_stdfsync(ap);
 
 	return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT));
 }
 
 /*
  * Mknod vnode call
  */
 /* ARGSUSED */
 static int
 ext2_mknod(struct vop_mknod_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp = ap->a_vpp;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	if (vap->va_rdev != VNOVAL) {
 		/*
 		 * Want to be able to use this to make badblock
 		 * inodes, so don't truncate the dev number.
 		 */
 		if (!(ip->i_flag & IN_E4EXTENTS))
 			ip->i_rdev = vap->va_rdev;
 	}
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.	 XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		*vpp = NULL;
 		return (error);
 	}
 	return (0);
 }
 
 static int
 ext2_remove(struct vop_remove_args *ap)
 {
 	struct inode *ip;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	int error;
 
 	ip = VTOI(vp);
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(dvp)->i_flags & APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	error = ext2_dirremove(dvp, ap->a_cnp);
 	if (error == 0) {
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
 out:
 	return (error);
 }
 
 /*
  * link vnode call
  */
 static int
 ext2_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip;
 	int error;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_link: no name");
 #endif
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	error = ext2_update(vp, !DOINGASYNC(vp));
 	if (!error)
 		error = ext2_direnter(ip, tdvp, cnp);
 	if (error) {
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
 out:
 	return (error);
 }
 
 static int
 ext2_inc_nlink(struct inode *ip)
 {
 
 	ip->i_nlink++;
 
 	if (S_ISDIR(ip->i_mode) &&
 	    EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) &&
 	    ip->i_nlink > 1) {
 		if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2)
 			ip->i_nlink = 1;
 	} else if (ip->i_nlink > EXT4_LINK_MAX) {
 		ip->i_nlink--;
 		return (EMLINK);
 	}
 
 	return (0);
 }
 
 static void
 ext2_dec_nlink(struct inode *ip)
 {
 
 	if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2)
 		ip->i_nlink--;
 }
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  */
 static int
 ext2_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct inode *ip, *xp, *dp;
 	struct dirtemplate *dirbuf;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0;
 	u_char namlen;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ext2_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto abortit;
 	}
 
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  Temporarily just warn if they do.
 	 */
 	if (fvp == tvp) {
 		SDT_PROBE2(ext2fs, , vnops, trace, 1,
 		    "rename: fvp == tvp (can't happen)");
 		error = 0;
 		goto abortit;
 	}
 
 	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto abortit;
 	dp = VTOI(fdvp);
 	ip = VTOI(fvp);
 	if (ip->i_nlink >= EXT4_LINK_MAX &&
 	    !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
 		VOP_UNLOCK(fvp, 0);
 		error = EMLINK;
 		goto abortit;
 	}
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (dp->i_flags & APPEND)) {
 		VOP_UNLOCK(fvp, 0);
 		error = EPERM;
 		goto abortit;
 	}
 	if ((ip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
 		    (ip->i_flag & IN_RENAME)) {
 			VOP_UNLOCK(fvp, 0);
 			error = EINVAL;
 			goto abortit;
 		}
 		ip->i_flag |= IN_RENAME;
 		oldparent = dp->i_number;
 		doingdirectory++;
 	}
 	vrele(fdvp);
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTOI(tdvp);
 	xp = NULL;
 	if (tvp)
 		xp = VTOI(tvp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	ext2_inc_nlink(ip);
 	ip->i_flag |= IN_CHANGE;
 	if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) {
 		VOP_UNLOCK(fvp, 0);
 		goto bad;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to checkpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	if (oldparent != dp->i_number)
 		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		error = ext2_checkpath(ip, dp, tcnp->cn_cred);
 		if (error)
 			goto out;
 		VREF(tdvp);
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		vrele(tdvp);
 		dp = VTOI(tdvp);
 		xp = NULL;
 		if (tvp)
 			xp = VTOI(tvp);
 	}
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (xp == NULL) {
 		if (dp->i_devvp != ip->i_devvp)
 			panic("ext2_rename: EXDEV");
 		/*
 		 * Account for ".." in new directory.
 		 * When source and destination have the same
 		 * parent we don't fool with the link count.
 		 */
 		if (doingdirectory && newparent) {
 			error = ext2_inc_nlink(dp);
 			if (error)
 				goto bad;
 
 			dp->i_flag |= IN_CHANGE;
 			error = ext2_update(tdvp, !DOINGASYNC(tdvp));
 			if (error)
 				goto bad;
 		}
 		error = ext2_direnter(ip, tdvp, tcnp);
 		if (error) {
 			if (doingdirectory && newparent) {
 				ext2_dec_nlink(dp);
 				dp->i_flag |= IN_CHANGE;
 				(void)ext2_update(tdvp, 1);
 			}
 			goto bad;
 		}
 		vput(tdvp);
 	} else {
 		if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp)
 			panic("ext2_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (xp->i_number == ip->i_number)
 			panic("ext2_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the user must
 		 * own the parent directory, or the destination of the rename,
 		 * otherwise the destination may not be changed (except by
 		 * root). This implements append-only directories.
 		 */
 		if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
 		    tcnp->cn_cred->cr_uid != dp->i_uid &&
 		    xp->i_uid != tcnp->cn_cred->cr_uid) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((xp->i_mode & IFMT) == IFDIR) {
 			if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = ext2_dirrewrite(dp, ip, tcnp);
 		if (error)
 			goto bad;
 		/*
 		 * If the target directory is in the same
 		 * directory as the source directory,
 		 * decrement the link count on the parent
 		 * of the target directory.
 		 */
 		if (doingdirectory && !newparent) {
 			ext2_dec_nlink(dp);
 			dp->i_flag |= IN_CHANGE;
 		}
 		vput(tdvp);
 		/*
 		 * Adjust the link count of the target to
 		 * reflect the dirrewrite above.  If this is
 		 * a directory it is empty and there are
 		 * no links to it, so we can squash the inode and
 		 * any space associated with it.  We disallowed
 		 * renaming over top of a directory with links to
 		 * it above, as the remaining link would point to
 		 * a directory without "." or ".." entries.
 		 */
 		ext2_dec_nlink(xp);
 		if (doingdirectory) {
 			if (--xp->i_nlink != 0)
 				panic("ext2_rename: linked directory");
 			error = ext2_truncate(tvp, (off_t)0, IO_SYNC,
 			    tcnp->cn_cred, tcnp->cn_thread);
 		}
 		xp->i_flag |= IN_CHANGE;
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * 3) Unlink the source.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	VREF(fdvp);
 	error = relookup(fdvp, &fvp, fcnp);
 	if (error == 0)
 		vrele(fdvp);
 	if (fvp != NULL) {
 		xp = VTOI(fvp);
 		dp = VTOI(fdvp);
 	} else {
 		/*
 		 * From name has disappeared.  IN_RENAME is not sufficient
 		 * to protect against directory races due to timing windows,
 		 * so we can't panic here.
 		 */
 		vrele(ap->a_fvp);
 		return (0);
 	}
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
 	 * is a directory then it cannot have been rmdir'ed; its link
 	 * count of three would cause a rmdir to fail with ENOTEMPTY.
 	 * The IN_RENAME flag ensures that it cannot be moved by another
 	 * rename.
 	 */
 	if (xp != ip) {
 		/*
 		 * From name resolves to a different inode.  IN_RENAME is
 		 * not sufficient protection against timing window races
 		 * so we can't panic here.
 		 */
 	} else {
 		/*
 		 * If the source is a directory with a
 		 * new parent, the link count of the old
 		 * parent directory must be decremented
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
 			ext2_dec_nlink(dp);
 			dp->i_flag |= IN_CHANGE;
 			dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO);
 			if (!dirbuf) {
 				error = ENOMEM;
 				goto bad;
 			}
 			error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf,
 			    ip->i_e2fs->e2fs_bsize, (off_t)0,
 			    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 			    tcnp->cn_cred, NOCRED, NULL, NULL);
 			if (error == 0) {
 				/* Like ufs little-endian: */
 				namlen = dirbuf->dotdot_type;
 				if (namlen != 2 ||
 				    dirbuf->dotdot_name[0] != '.' ||
 				    dirbuf->dotdot_name[1] != '.') {
 					ext2_dirbad(xp, (doff_t)12,
 					    "rename: mangled dir");
 				} else {
 					dirbuf->dotdot_ino = newparent;
 					/*
 					 * dirblock 0 could be htree root,
 					 * try both csum update functions.
 					 */
 					ext2_dirent_csum_set(ip,
 					    (struct ext2fs_direct_2 *)dirbuf);
 					ext2_dx_csum_set(ip,
 					    (struct ext2fs_direct_2 *)dirbuf);
 					(void)vn_rdwr(UIO_WRITE, fvp,
 					    (caddr_t)dirbuf,
 					    ip->i_e2fs->e2fs_bsize,
 					    (off_t)0, UIO_SYSSPACE,
 					    IO_NODELOCKED | IO_SYNC |
 					    IO_NOMACCHECK, tcnp->cn_cred,
 					    NOCRED, NULL, NULL);
 					cache_purge(fdvp);
 				}
 			}
 			free(dirbuf, M_TEMP);
 		}
 		error = ext2_dirremove(fdvp, fcnp);
 		if (!error) {
 			ext2_dec_nlink(xp);
 			xp->i_flag |= IN_CHANGE;
 		}
 		xp->i_flag &= ~IN_RENAME;
 	}
 	if (dp)
 		vput(fdvp);
 	if (xp)
 		vput(fvp);
 	vrele(ap->a_fvp);
 	return (error);
 
 bad:
 	if (xp)
 		vput(ITOV(xp));
 	vput(ITOV(dp));
 out:
 	if (doingdirectory)
 		ip->i_flag &= ~IN_RENAME;
 	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
 		ext2_dec_nlink(ip);
 		ip->i_flag |= IN_CHANGE;
 		ip->i_flag &= ~IN_RENAME;
 		vput(fvp);
 	} else
 		vrele(fvp);
 	return (error);
 }
 
 #ifdef UFS_ACL
 static int
 ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
     mode_t dmode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct inode *ip = VTOI(tvp);
 	struct acl *dacl, *acl;
 
 	acl = acl_alloc(M_WAITOK);
 	dacl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL from parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	switch (error) {
 	case 0:
 		/*
 		 * Retrieved a default ACL, so merge mode and ACL if
 		 * necessary.  If the ACL is empty, fall through to
 		 * the "not defined or available" case.
 		 */
 		if (acl->acl_cnt != 0) {
 			dmode = acl_posix1e_newfilemode(dmode, acl);
 			ip->i_mode = dmode;
 			*dacl = *acl;
 			ext2_sync_acl_from_inode(ip, acl);
 			break;
 		}
 		/* FALLTHROUGH */
 
 	case EOPNOTSUPP:
 		/*
 		 * Just use the mode as-is.
 		 */
 		ip->i_mode = dmode;
 		error = 0;
 		goto out;
 
 	default:
 		goto out;
 	}
 
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	if (error == 0)
 		error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td);
 	switch (error) {
 	case 0:
 		break;
 
 	case EOPNOTSUPP:
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above
 		 * was supposed to free acl.
 		 */
 #ifdef DEBUG
 		printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 #endif	/* DEBUG */
 		break;
 
 	default:
 		goto out;
 	}
 
 out:
 	acl_free(acl);
 	acl_free(dacl);
 
 	return (error);
 }
 
 static int
 ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
     mode_t mode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct inode *ip = VTOI(tvp);
 	struct acl *acl;
 
 	acl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL for parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	switch (error) {
 	case 0:
 		/*
 		 * Retrieved a default ACL, so merge mode and ACL if
 		 * necessary.
 		 */
 		if (acl->acl_cnt != 0) {
 			/*
 			 * Two possible ways for default ACL to not
 			 * be present.  First, the EA can be
 			 * undefined, or second, the default ACL can
 			 * be blank.  If it's blank, fall through to
 			 * the it's not defined case.
 			 */
 			mode = acl_posix1e_newfilemode(mode, acl);
 			ip->i_mode = mode;
 			ext2_sync_acl_from_inode(ip, acl);
 			break;
 		}
 		/* FALLTHROUGH */
 
 	case EOPNOTSUPP:
 		/*
 		 * Just use the mode as-is.
 		 */
 		ip->i_mode = mode;
 		error = 0;
 		goto out;
 
 	default:
 		goto out;
 	}
 
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	switch (error) {
 	case 0:
 		break;
 
 	case EOPNOTSUPP:
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above was
 		 * supposed to free acl.
 		 */
 		printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()\n");
 		/* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()"); */
 		break;
 
 	default:
 		goto out;
 	}
 
 out:
 	acl_free(acl);
 
 	return (error);
 }
 
 #endif /* UFS_ACL */
 
 /*
  * Mkdir system call
  */
 static int
 ext2_mkdir(struct vop_mkdir_args *ap)
 {
 	struct m_ext2fs *fs;
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct dirtemplate dirtemplate, *dtp;
 	char *buf = NULL;
 	int error, dmode;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX &&
 	    !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
 		error = EMLINK;
 		goto out;
 	}
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ext2_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	fs = ip->i_e2fs;
 	ip->i_gid = dp->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 2;
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 	error = ext2_update(tvp, 1);
 
 	/*
 	 * Bump link count in parent directory
 	 * to reflect work done below.  Should
 	 * be done before reference is created
 	 * so reparation is possible if we crash.
 	 */
 	ext2_inc_nlink(dp);
 	dp->i_flag |= IN_CHANGE;
 	error = ext2_update(dvp, !DOINGASYNC(dvp));
 	if (error)
 		goto bad;
 
 	/* Initialize directory with "." and ".." from static template. */
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs,
 	    EXT2F_INCOMPAT_FTYPE))
 		dtp = &mastertemplate;
 	else
 		dtp = &omastertemplate;
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	/*
 	 * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's
 	 * just redefine it - for this function only
 	 */
 #undef  DIRBLKSIZ
 #define DIRBLKSIZ  VTOI(dvp)->i_e2fs->e2fs_bsize
 	dirtemplate.dotdot_reclen = DIRBLKSIZ - 12;
 	buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO);
 	if (!buf) {
 		error = ENOMEM;
 		ext2_dec_nlink(dp);
 		dp->i_flag |= IN_CHANGE;
 		goto bad;
 	}
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		dirtemplate.dotdot_reclen -= sizeof(struct ext2fs_direct_tail);
 		ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ));
 	}
 	memcpy(buf, &dirtemplate, sizeof(dirtemplate));
 	ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf);
 	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf,
 	    DIRBLKSIZ, (off_t)0, UIO_SYSSPACE,
 	    IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED,
 	    NULL, NULL);
 	if (error) {
 		ext2_dec_nlink(dp);
 		dp->i_flag |= IN_CHANGE;
 		goto bad;
 	}
 	if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
 		/* XXX should grow with balloc() */
 		panic("ext2_mkdir: blksize");
 	else {
 		ip->i_size = DIRBLKSIZ;
 		ip->i_flag |= IN_CHANGE;
 	}
 
 #ifdef UFS_ACL
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 
 #endif /* UFS_ACL */
 
 	/* Directory set up, now install its entry in the parent directory. */
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error) {
 		ext2_dec_nlink(dp);
 		dp->i_flag |= IN_CHANGE;
 	}
 bad:
 	/*
 	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
 	 * for us because we set the link count to 0.
 	 */
 	if (error) {
 		ip->i_nlink = 0;
 		ip->i_flag |= IN_CHANGE;
 		vput(tvp);
 	} else
 		*ap->a_vpp = tvp;
 out:
 	free(buf, M_TEMP);
 	return (error);
 #undef  DIRBLKSIZ
 #define DIRBLKSIZ  DEV_BSIZE
 }
 
 /*
  * Rmdir system call.
  */
 static int
 ext2_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	error = ext2_dirremove(dvp, cnp);
 	if (error)
 		goto out;
 	ext2_dec_nlink(dp);
 	dp->i_flag |= IN_CHANGE;
 	cache_purge(dvp);
 	VOP_UNLOCK(dvp, 0);
 	/*
 	 * Truncate inode.  The only stuff left
 	 * in the directory is "." and "..".
 	 */
 	ip->i_nlink = 0;
 	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
 	    cnp->cn_thread);
 	cache_purge(ITOV(ip));
 	if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  */
 static int
 ext2_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *vp, **vpp = ap->a_vpp;
 	struct inode *ip;
 	int len, error;
 
 	error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
 		ip->i_size = len;
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target),
 		    len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
 	if (error)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link
  */
 static int
 ext2_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	int isize;
 
 	isize = ip->i_size;
 	if (isize < vp->v_mount->mnt_maxsymlinklen) {
 		uiomove((char *)ip->i_shortlink, isize, ap->a_uio);
 		return (0);
 	}
 	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ext2_bmaparray() operation may not
  * deadlock on memory.  See ext2_bmap() for details.
  */
 static int
 ext2_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 	daddr_t blkno;
 	int error;
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("ext2_strategy: spec");
 	if (bp->b_blkno == bp->b_lblkno) {
 
 		if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS)
 			error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL);
 		else
 			error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL);
 
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = VFSTOEXT2(vp->v_mount)->um_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  */
 static int
 ext2_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  */
 static int
 ext2fifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ext2_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Fall through to ext2 kqfilter routines if needed
  */
 static int
 ext2fifo_kqfilter(struct vop_kqfilter_args *ap)
 {
 	int error;
 
 	error = fifo_specops.vop_kqfilter(ap);
 	if (error)
 		error = vfs_kqfilter(ap);
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to ext2 filesystems.
  */
 static int
 ext2_pathconf(struct vop_pathconf_args *ap)
 {
 	int error = 0;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs,
 		    EXT2F_ROCOMPAT_DIR_NLINK))
 			*ap->a_retval = INT_MAX;
 		else
 			*ap->a_retval = EXT4_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 
 #ifdef UFS_ACL
 	case _PC_ACL_EXTENDED:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_PATH_MAX:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 		break;
 #endif /* UFS_ACL */
 
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1;	/* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  */
 static int
 ext2_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error)
 		return (error);
 
 	error = ENOATTR;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name);
 		if (error != ENOATTR)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  */
 static int
 ext2_getextattr(struct vop_getextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	error = ENOATTR;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_get(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio, ap->a_size);
 		if (error != ENOATTR)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_get(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio, ap->a_size);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  */
 static int
 ext2_listextattr(struct vop_listextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_list(ip, ap->a_attrnamespace,
 		    ap->a_uio, ap->a_size);
 		if (error)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_list(ip, ap->a_attrnamespace,
 		    ap->a_uio, ap->a_size);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 ext2_setextattr(struct vop_setextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error)
 		return (error);
 
 	error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name);
 	if (error)
 		return (error);
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_set(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio);
 		if (error != ENOSPC)
 			return (error);
 	}
 
 	error = ext2_extattr_block_set(ip, ap->a_attrnamespace,
 	    ap->a_name, ap->a_uio);
 
 	return (error);
 }
 
 /*
  * Vnode pointer to File handle
  */
 /* ARGSUSED */
 static int
 ext2_vptofh(struct vop_vptofh_args *ap)
 {
 	struct inode *ip;
 	struct ufid *ufhp;
 
 	ip = VTOI(ap->a_vp);
 	ufhp = (struct ufid *)ap->a_fhp;
 	ufhp->ufid_len = sizeof(struct ufid);
 	ufhp->ufid_ino = ip->i_number;
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
 
 /*
  * Initialize the vnode associated with a new inode, handle aliased
  * vnodes.
  */
 int
 ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp)
 {
 	struct inode *ip;
 	struct vnode *vp;
 
 	vp = *vpp;
 	ip = VTOI(vp);
 	vp->v_type = IFTOVT(ip->i_mode);
 	/*
 	 * Only unallocated inodes should be of type VNON.
 	 */
 	if (ip->i_mode != 0 && vp->v_type == VNON)
 		return (EINVAL);
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 
 	if (ip->i_number == EXT2_ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 	ip->i_modrev = init_va_filerev();
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Allocate a new inode.
  */
 static int
 ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp)
 {
 	struct inode *ip, *pdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_makeinode: no name");
 #endif
 	*vpp = NULL;
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp);
 	if (error) {
 		return (error);
 	}
 	ip = VTOI(tvp);
 	ip->i_gid = pdir->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are
 		 * not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			mode &= ~07111;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 1;
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) {
 		if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID))
 			ip->i_mode &= ~ISGID;
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = ext2_update(tvp, !DOINGASYNC(tvp));
 	if (error)
 		goto bad;
 
 #ifdef UFS_ACL
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* UFS_ACL */
 
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error)
 		goto bad;
 
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 */
 	ip->i_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
 	vput(tvp);
 	return (error);
 }
 
 /*
  * Vnode op for reading.
  */
 static int
 ext2_read(struct vop_read_args *ap)
 {
 	struct vnode *vp;
 	struct inode *ip;
 	struct uio *uio;
 	struct m_ext2fs *fs;
 	struct buf *bp;
 	daddr_t lbn, nextlbn;
 	off_t bytesinfile;
 	long size, xfersize, blkoffset;
 	int error, orig_resid, seqcount;
 	int ioflag;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_READ)
 		panic("%s: mode", "ext2_read");
 
 	if (vp->v_type == VLNK) {
 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 			panic("%s: short symlink", "ext2_read");
 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
 		panic("%s: type %d", "ext2_read", vp->v_type);
 #endif
 	orig_resid = uio->uio_resid;
 	KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0"));
 	if (orig_resid == 0)
 		return (0);
 	KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
 	fs = ip->i_e2fs;
 	if (uio->uio_offset < ip->i_size &&
 	    uio->uio_offset >= fs->e2fs_maxfilesize)
 		return (EOVERFLOW);
 
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 			break;
 		lbn = lblkno(fs, uio->uio_offset);
 		nextlbn = lbn + 1;
 		size = blksize(fs, ip, lbn);
 		blkoffset = blkoff(fs, uio->uio_offset);
 
 		xfersize = fs->e2fs_fsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (bytesinfile < xfersize)
 			xfersize = bytesinfile;
 
 		if (lblktosize(fs, nextlbn) >= ip->i_size)
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			error = cluster_read(vp, ip->i_size, lbn, size,
 			    NOCRED, blkoffset + uio->uio_resid, seqcount,
 			    0, &bp);
 		} else if (seqcount > 1) {
 			u_int nextsize = blksize(fs, ip, nextlbn);
 
 			error = breadn(vp, lbn,
 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 		} else
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			bp = NULL;
 			break;
 		}
 
 		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
 		 * then we want to ensure that we do not uiomove bad
 		 * or uninitialized data.
 		 */
 		size -= bp->b_resid;
 		if (size < xfersize) {
 			if (size == 0)
 				break;
 			xfersize = size;
 		}
 		error = uiomove((char *)bp->b_data + blkoffset,
 		    (int)xfersize, uio);
 		if (error)
 			break;
 		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
 	 * This can only happen in the case of an error because the loop
 	 * above resets bp to NULL on each iteration and on normal
 	 * completion has not set a new value into it. so it must have come
 	 * from a 'break' statement
 	 */
 	if (bp != NULL)
 		vfs_bio_brelse(bp, ioflag);
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		ip->i_flag |= IN_ACCESS;
 	return (error);
 }
 
 static int
 ext2_ioctl(struct vop_ioctl_args *ap)
 {
 
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
 	case FIOSEEKHOLE:
 		return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
 		    (off_t *)ap->a_data, ap->a_cred));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * Vnode op for writing.
  */
 static int
 ext2_write(struct vop_write_args *ap)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	struct buf *bp;
 	daddr_t lbn;
 	off_t osize;
 	int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
 
 	ioflag = ap->a_ioflag;
 	uio = ap->a_uio;
 	vp = ap->a_vp;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_WRITE)
 		panic("%s: mode", "ext2_write");
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = ip->i_size;
 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 			return (EPERM);
 		/* FALLTHROUGH */
 	case VLNK:
 		break;
 	case VDIR:
 		/* XXX differs from ffs -- this is called from ext2_mkdir(). */
 		if ((ioflag & IO_SYNC) == 0)
 			panic("ext2_write: nonsync dir write");
 		break;
 	default:
 		panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp,
 		    vp->v_type, (intmax_t)uio->uio_offset,
 		    (intmax_t)uio->uio_resid);
 	}
 
 	KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0"));
 	KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0"));
 	fs = ip->i_e2fs;
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize)
 		return (EFBIG);
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, I don't think it matters.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 
 	resid = uio->uio_resid;
 	osize = ip->i_size;
 	if (seqcount > BA_SEQMAX)
 		flags = BA_SEQMAX << BA_SEQSHIFT;
 	else
 		flags = seqcount << BA_SEQSHIFT;
 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 		flags |= IO_SYNC;
 
 	for (error = 0; uio->uio_resid > 0;) {
 		lbn = lblkno(fs, uio->uio_offset);
 		blkoffset = blkoff(fs, uio->uio_offset);
 		xfersize = fs->e2fs_fsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (uio->uio_offset + xfersize > ip->i_size)
 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 
 		/*
 		 * We must perform a read-before-write if the transfer size
 		 * does not cover the entire buffer.
 		 */
 		if (fs->e2fs_bsize > xfersize)
 			flags |= BA_CLRBUF;
 		else
 			flags &= ~BA_CLRBUF;
 		error = ext2_balloc(ip, lbn, blkoffset + xfersize,
 		    ap->a_cred, &bp, flags);
 		if (error != 0)
 			break;
 
 		if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL))
 			bp->b_flags |= B_NOCACHE;
 		if (uio->uio_offset + xfersize > ip->i_size)
 			ip->i_size = uio->uio_offset + xfersize;
 		size = blksize(fs, ip, lbn) - bp->b_resid;
 		if (size < xfersize)
 			xfersize = size;
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 		/*
 		 * If the buffer is not already filled and we encounter an
 		 * error while trying to fill it, we have to clear out any
 		 * garbage data from the pages instantiated for the buffer.
 		 * If we do not, a failed uiomove() during a write can leave
 		 * the prior contents of the pages exposed to a userland mmap.
 		 *
 		 * Note that we need only clear buffers with a transfer size
 		 * equal to the block size because buffers with a shorter
 		 * transfer size were cleared above by the call to ext2_balloc()
 		 * with the BA_CLRBUF flag set.
 		 *
 		 * If the source region for uiomove identically mmaps the
 		 * buffer, uiomove() performed the NOP copy, and the buffer
 		 * content remains valid because the page fault handler
 		 * validated the pages.
 		 */
 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 		    fs->e2fs_bsize == xfersize)
 			vfs_bio_clrbuf(bp);
 
 		vfs_bio_set_flags(bp, ioflag);
 
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
 		 * asynchronously.  Otherwise try to cluster, and if that
 		 * doesn't do it then either do an async write (if O_DIRECT),
 		 * or a delayed write (if not).
 		 */
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() ||
 			    buf_dirty_count_severe() ||
 		    (ioflag & IO_ASYNC)) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else if (xfersize + blkoffset == fs->e2fs_fsize) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
 				cluster_write(vp, bp, ip->i_size, seqcount, 0);
 			} else {
 				bawrite(bp);
 			}
 		} else if (ioflag & IO_DIRECT) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else {
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
 		if (error || xfersize == 0)
 			break;
 	}
 	/*
 	 * If we successfully wrote any data, and we are not the superuser
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 	    ap->a_cred) {
 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID))
 			ip->i_mode &= ~(ISUID | ISGID);
 	}
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			(void)ext2_truncate(vp, osize,
 			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		}
 	}
 	if (uio->uio_resid != resid) {
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (ioflag & IO_SYNC)
 			error = ext2_update(vp, 1);
 	}
 	return (error);
 }
Index: head/sys/fs/fdescfs/fdesc_vnops.c
===================================================================
--- head/sys/fs/fdescfs/fdesc_vnops.c	(revision 355789)
+++ head/sys/fs/fdescfs/fdesc_vnops.c	(revision 355790)
@@ -1,660 +1,661 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fdesc_vnops.c	8.9 (Berkeley) 1/21/94
  *
  * $FreeBSD$
  */
 
 /*
  * /dev/fd Filesystem
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>	/* boottime */
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/file.h>	/* Must come after sys/malloc.h */
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <fs/fdescfs/fdesc.h>
 
 #define	NFDCACHE 4
 #define FD_NHASH(ix) \
 	(&fdhashtbl[(ix) & fdhash])
 static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl;
 static u_long fdhash;
 
 struct mtx fdesc_hashmtx;
 
 static vop_getattr_t	fdesc_getattr;
 static vop_lookup_t	fdesc_lookup;
 static vop_open_t	fdesc_open;
 static vop_pathconf_t	fdesc_pathconf;
 static vop_readdir_t	fdesc_readdir;
 static vop_readlink_t	fdesc_readlink;
 static vop_reclaim_t	fdesc_reclaim;
 static vop_setattr_t	fdesc_setattr;
 
 static struct vop_vector fdesc_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		VOP_NULL,
 	.vop_getattr =		fdesc_getattr,
 	.vop_lookup =		fdesc_lookup,
 	.vop_open =		fdesc_open,
 	.vop_pathconf =		fdesc_pathconf,
 	.vop_readdir =		fdesc_readdir,
 	.vop_readlink =		fdesc_readlink,
 	.vop_reclaim =		fdesc_reclaim,
 	.vop_setattr =		fdesc_setattr,
 };
+VFS_VOP_VECTOR_REGISTER(fdesc_vnodeops);
 
 static void fdesc_insmntque_dtr(struct vnode *, void *);
 static void fdesc_remove_entry(struct fdescnode *);
 
 /*
  * Initialise cache headers
  */
 int
 fdesc_init(struct vfsconf *vfsp)
 {
 
 	mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF);
 	fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash);
 	return (0);
 }
 
 /*
  * Uninit ready for unload.
  */
 int
 fdesc_uninit(struct vfsconf *vfsp)
 {
 
 	hashdestroy(fdhashtbl, M_CACHE, fdhash);
 	mtx_destroy(&fdesc_hashmtx);
 	return (0);
 }
 
 /*
  * If allocating vnode fails, call this.
  */
 static void
 fdesc_insmntque_dtr(struct vnode *vp, void *arg)
 {
 
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Remove an entry from the hash if it exists.
  */
 static void
 fdesc_remove_entry(struct fdescnode *fd)
 {
 	struct fdhashhead *fc;
 	struct fdescnode *fd2;
 
 	fc = FD_NHASH(fd->fd_ix);
 	mtx_lock(&fdesc_hashmtx);
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd == fd2) {
 			LIST_REMOVE(fd, fd_hash);
 			break;
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 }
 
 int
 fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp,
     struct vnode **vpp)
 {
 	struct fdescmount *fmp;
 	struct fdhashhead *fc;
 	struct fdescnode *fd, *fd2;
 	struct vnode *vp, *vp2;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	fc = FD_NHASH(ix);
 loop:
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		return (-1);
 	}
 
 	LIST_FOREACH(fd, fc, fd_hash) {
 		if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp = fd->fd_vnode;
 			VI_LOCK(vp);
 			mtx_unlock(&fdesc_hashmtx);
 			if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td))
 				goto loop;
 			*vpp = vp;
 			return (0);
 		}
 	}
 	mtx_unlock(&fdesc_hashmtx);
 
 	fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK);
 
 	error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp);
 	if (error) {
 		free(fd, M_TEMP);
 		return (error);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_data = fd;
 	fd->fd_vnode = vp;
 	fd->fd_type = ftype;
 	fd->fd_fd = fd_fd;
 	fd->fd_ix = ix;
 	if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF)
 		vp->v_vflag |= VV_READLINK;
 	error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL);
 	if (error != 0) {
 		*vpp = NULLVP;
 		return (error);
 	}
 
 	/* Make sure that someone didn't beat us when inserting the vnode. */
 	mtx_lock(&fdesc_hashmtx);
 	/*
 	 * If a forced unmount is progressing, we need to drop it. The flags are
 	 * protected by the hashmtx.
 	 */
 	fmp = mp->mnt_data;
 	if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) {
 		mtx_unlock(&fdesc_hashmtx);
 		vgone(vp);
 		vput(vp);
 		*vpp = NULLVP;
 		return (-1);
 	}
 
 	LIST_FOREACH(fd2, fc, fd_hash) {
 		if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) {
 			/* Get reference to vnode in case it's being free'd */
 			vp2 = fd2->fd_vnode;
 			VI_LOCK(vp2);
 			mtx_unlock(&fdesc_hashmtx);
 			error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td);
 			/* Someone beat us, dec use count and wait for reclaim */
 			vgone(vp);
 			vput(vp);
 			/* If we didn't get it, return no vnode. */
 			if (error)
 				vp2 = NULLVP;
 			*vpp = vp2;
 			return (error);
 		}
 	}
 
 	/* If we came here, we can insert it safely. */
 	LIST_INSERT_HEAD(fc, fd, fd_hash);
 	mtx_unlock(&fdesc_hashmtx);
 	*vpp = vp;
 	return (0);
 }
 
 struct fdesc_get_ino_args {
 	fdntype ftype;
 	unsigned fd_fd;
 	int ix;
 	struct file *fp;
 	struct thread *td;
 };
 
 static int
 fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 	struct fdesc_get_ino_args *a;
 	int error;
 
 	a = arg;
 	error = fdesc_allocvp(a->ftype, a->fd_fd, a->ix, mp, rvp);
 	fdrop(a->fp, a->td);
 	return (error);
 }
 
 
 /*
  * vp is the current namei directory
  * ndp is the name to locate in that directory...
  */
 static int
 fdesc_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	char *pname = cnp->cn_nameptr;
 	struct thread *td = cnp->cn_thread;
 	struct file *fp;
 	struct fdesc_get_ino_args arg;
 	int nlen = cnp->cn_namelen;
 	u_int fd, fd1;
 	int error;
 	struct vnode *fvp;
 
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad;
 	}
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (VTOFDESC(dvp)->fd_type != Froot) {
 		error = ENOTDIR;
 		goto bad;
 	}
 
 	fd = 0;
 	/* the only time a leading 0 is acceptable is if it's "0" */
 	if (*pname == '0' && nlen != 1) {
 		error = ENOENT;
 		goto bad;
 	}
 	while (nlen--) {
 		if (*pname < '0' || *pname > '9') {
 			error = ENOENT;
 			goto bad;
 		}
 		fd1 = 10 * fd + *pname++ - '0';
 		if (fd1 < fd) {
 			error = ENOENT;
 			goto bad;
 		}
 		fd = fd1;
 	}
 
 	/*
 	 * No rights to check since 'fp' isn't actually used.
 	 */
 	if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0)
 		goto bad;
 
 	/* Check if we're looking up ourselves. */
 	if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) {
 		/*
 		 * In case we're holding the last reference to the file, the dvp
 		 * will be re-acquired.
 		 */
 		vhold(dvp);
 		VOP_UNLOCK(dvp, 0);
 		fdrop(fp, td);
 
 		/* Re-aquire the lock afterwards. */
 		vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(dvp);
 		fvp = dvp;
 		if (VN_IS_DOOMED(dvp))
 			error = ENOENT;
 	} else {
 		/*
 		 * Unlock our root node (dvp) when doing this, since we might
 		 * deadlock since the vnode might be locked by another thread
 		 * and the root vnode lock will be obtained afterwards (in case
 		 * we're looking up the fd of the root vnode), which will be the
 		 * opposite lock order. Vhold the root vnode first so we don't
 		 * lose it.
 		 */
 		arg.ftype = Fdesc;
 		arg.fd_fd = fd;
 		arg.ix = FD_DESC + fd;
 		arg.fp = fp;
 		arg.td = td;
 		error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg,
 		    LK_EXCLUSIVE, &fvp);
 	}
 
 	if (error)
 		goto bad;
 	*vpp = fvp;
 	return (0);
 
 bad:
 	*vpp = NULL;
 	return (error);
 }
 
 static int
 fdesc_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (VTOFDESC(vp)->fd_type == Froot)
 		return (0);
 
 	/*
 	 * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file
 	 * descriptor being sought for duplication. The error return ensures
 	 * that the vnode for this device will be released by vn_open. Open
 	 * will detect this special error and take the actions in dupfdopen.
 	 * Other callers of vn_open or VOP_OPEN will simply report the
 	 * error.
 	 */
 	ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd;	/* XXX */
 	return (ENODEV);
 }
 
 static int
 fdesc_pathconf(struct vop_pathconf_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error;
 
 	switch (ap->a_name) {
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		if (VTOFDESC(vp)->fd_type == Froot)
 			*ap->a_retval = 2;
 		else
 			*ap->a_retval = 1;
 		return (0);
 	default:
 		if (VTOFDESC(vp)->fd_type == Froot)
 			return (vop_stdpathconf(ap));
 		vref(vp);
 		VOP_UNLOCK(vp, 0);
 		error = kern_fpathconf(curthread, VTOFDESC(vp)->fd_fd,
 		    ap->a_name, ap->a_retval);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		vunref(vp);
 		return (error);
 	}
 }
 
 static int
 fdesc_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct timeval boottime;
 
 	getboottime(&boottime);
 	vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
 	vap->va_fileid = VTOFDESC(vp)->fd_ix;
 	vap->va_uid = 0;
 	vap->va_gid = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_atime.tv_sec = boottime.tv_sec;
 	vap->va_atime.tv_nsec = 0;
 	vap->va_mtime = vap->va_atime;
 	vap->va_ctime = vap->va_mtime;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 
 	switch (VTOFDESC(vp)->fd_type) {
 	case Froot:
 		vap->va_type = VDIR;
 		vap->va_nlink = 2;
 		vap->va_size = DEV_BSIZE;
 		vap->va_rdev = NODEV;
 		break;
 
 	case Fdesc:
 		vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK;
 		vap->va_nlink = 1;
 		vap->va_size = 0;
 		vap->va_rdev = makedev(0, vap->va_fileid);
 		break;
 
 	default:
 		panic("fdesc_getattr");
 		break;
 	}
 
 	vp->v_type = vap->va_type;
 	return (0);
 }
 
 static int
 fdesc_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	struct thread *td = curthread;
 	cap_rights_t rights;
 	unsigned fd;
 	int error;
 
 	/*
 	 * Can't mess with the root vnode
 	 */
 	if (VTOFDESC(ap->a_vp)->fd_type == Froot)
 		return (EACCES);
 
 	fd = VTOFDESC(ap->a_vp)->fd_fd;
 
 	/*
 	 * Allow setattr where there is an underlying vnode.
 	 */
 	error = getvnode(td, fd,
 	    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
 	if (error) {
 		/*
 		 * getvnode() returns EINVAL if the file descriptor is not
 		 * backed by a vnode.  Silently drop all changes except
 		 * chflags(2) in this case.
 		 */
 		if (error == EINVAL) {
 			if (vap->va_flags != VNOVAL)
 				error = EOPNOTSUPP;
 			else
 				error = 0;
 		}
 		return (error);
 	}
 	vp = fp->f_vnode;
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */
 
 static int
 fdesc_readdir(struct vop_readdir_args *ap)
 {
 	struct fdescmount *fmp;
 	struct uio *uio = ap->a_uio;
 	struct filedesc *fdp;
 	struct dirent d;
 	struct dirent *dp = &d;
 	int error, i, off, fcnt;
 
 	if (VTOFDESC(ap->a_vp)->fd_type != Froot)
 		panic("fdesc_readdir: not dir");
 
 	fmp = VFSTOFDESC(ap->a_vp->v_mount);
 	if (ap->a_ncookies != NULL)
 		*ap->a_ncookies = 0;
 
 	off = (int)uio->uio_offset;
 	if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 ||
 	    uio->uio_resid < UIO_MX)
 		return (EINVAL);
 	i = (u_int)off / UIO_MX;
 	fdp = uio->uio_td->td_proc->p_fd;
 	error = 0;
 
 	fcnt = i - 2;		/* The first two nodes are `.' and `..' */
 
 	FILEDESC_SLOCK(fdp);
 	while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) {
 		bzero((caddr_t)dp, UIO_MX);
 		switch (i) {
 		case 0:	/* `.' */
 		case 1: /* `..' */
 			dp->d_fileno = i + FD_ROOT;
 			dp->d_namlen = i + 1;
 			dp->d_reclen = UIO_MX;
 			bcopy("..", dp->d_name, dp->d_namlen);
 			dp->d_type = DT_DIR;
 			dirent_terminate(dp);
 			break;
 		default:
 			if (fdp->fd_ofiles[fcnt].fde_file == NULL)
 				break;
 			dp->d_namlen = sprintf(dp->d_name, "%d", fcnt);
 			dp->d_reclen = UIO_MX;
 			dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ?
 			    DT_CHR : DT_LNK;
 			dp->d_fileno = i + FD_DESC;
 			dirent_terminate(dp);
 			break;
 		}
 		/* NOTE: d_off is the offset of the *next* entry. */
 		dp->d_off = UIO_MX * (i + 1);
 		if (dp->d_namlen != 0) {
 			/*
 			 * And ship to userland
 			 */
 			FILEDESC_SUNLOCK(fdp);
 			error = uiomove(dp, UIO_MX, uio);
 			if (error)
 				goto done;
 			FILEDESC_SLOCK(fdp);
 		}
 		i++;
 		fcnt++;
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 done:
 	uio->uio_offset = i * UIO_MX;
 	return (error);
 }
 
 static int
 fdesc_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct fdescnode *fd;
 
  	vp = ap->a_vp;
  	fd = VTOFDESC(vp);
 	fdesc_remove_entry(fd);
 	free(vp->v_data, M_TEMP);
 	vp->v_data = NULL;
 	return (0);
 }
 
 static int
 fdesc_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vp, *vn;
 	struct thread *td;
 	struct uio *uio;
 	struct file *fp;
 	char *freepath, *fullpath;
 	size_t pathlen;
 	int lockflags, fd_fd;
 	int error;
 
 	freepath = NULL;
 	vn = va->a_vp;
 	if (VTOFDESC(vn)->fd_type != Fdesc)
 		panic("fdesc_readlink: not fdescfs link");
 	fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd;
 	lockflags = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	td = curthread;
 	error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL);
 	if (error != 0)
 		goto out;
 
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		error = vn_fullpath(td, vp, &fullpath, &freepath);
 		break;
 	default:
 		fullpath = "anon_inode:[unknown]";
 		break;
 	}
 	if (error == 0) {
 		uio = va->a_uio;
 		pathlen = strlen(fullpath);
 		error = uiomove(fullpath, pathlen, uio);
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 	fdrop(fp, td);
 
 out:
 	vn_lock(vn, lockflags | LK_RETRY);
 	return (error);
 }
Index: head/sys/fs/fifofs/fifo_vnops.c
===================================================================
--- head/sys/fs/fifofs/fifo_vnops.c	(revision 355789)
+++ head/sys/fs/fifofs/fifo_vnops.c	(revision 355790)
@@ -1,360 +1,361 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2005 Robert N. M. Watson
  * Copyright (c) 2012 Giovanni Trematerra
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)fifo_vnops.c	8.10 (Berkeley) 5/27/95
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/event.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/selinfo.h>
 #include <sys/pipe.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/un.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 /*
  * This structure is associated with the FIFO vnode and stores
  * the state associated with the FIFO.
  * Notes about locking:
  *   - fi_pipe is invariant since init time.
  *   - fi_readers and fi_writers are protected by the vnode lock.
  */
 struct fifoinfo {
 	struct pipe *fi_pipe;
 	long	fi_readers;
 	long	fi_writers;
 	u_int	fi_rgen;
 	u_int	fi_wgen;
 };
 
 static vop_print_t	fifo_print;
 static vop_open_t	fifo_open;
 static vop_close_t	fifo_close;
 static vop_advlock_t	fifo_advlock;
 
 struct vop_vector fifo_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_advlock =		fifo_advlock,
 	.vop_close =		fifo_close,
 	.vop_create =		VOP_PANIC,
 	.vop_getattr =		VOP_EBADF,
 	.vop_ioctl =		VOP_PANIC,
 	.vop_kqfilter =		VOP_PANIC,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		fifo_open,
 	.vop_pathconf =		VOP_PANIC,
 	.vop_print =		fifo_print,
 	.vop_read =		VOP_PANIC,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		VOP_NULL,
 	.vop_remove =		VOP_PANIC,
 	.vop_rename =		VOP_PANIC,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		VOP_EBADF,
 	.vop_symlink =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
+VFS_VOP_VECTOR_REGISTER(fifo_specops);
 
 /*
  * Dispose of fifo resources.
  */
 static void
 fifo_cleanup(struct vnode *vp)
 {
 	struct fifoinfo *fip;
 
 	ASSERT_VOP_ELOCKED(vp, "fifo_cleanup");
 	fip = vp->v_fifoinfo;
 	if (fip->fi_readers == 0 && fip->fi_writers == 0) {
 		vp->v_fifoinfo = NULL;
 		pipe_dtor(fip->fi_pipe);
 		free(fip, M_VNODE);
 	}
 }
 
 /*
  * Open called to set up a new instance of a fifo or
  * to find an active instance of a fifo.
  */
 /* ARGSUSED */
 static int
 fifo_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 		struct file *a_fp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct thread *td;
 	struct fifoinfo *fip;
 	struct pipe *fpipe;
 	u_int gen;
 	int error, stops_deferred;
 
 	vp = ap->a_vp;
 	fp = ap->a_fp;
 	td = ap->a_td;
 	ASSERT_VOP_ELOCKED(vp, "fifo_open");
 	if (fp == NULL || (ap->a_mode & FEXEC) != 0)
 		return (EINVAL);
 	if ((fip = vp->v_fifoinfo) == NULL) {
 		pipe_named_ctor(&fpipe, td);
 		fip = malloc(sizeof(*fip), M_VNODE, M_WAITOK);
 		fip->fi_pipe = fpipe;
 		fpipe->pipe_wgen = fip->fi_readers = fip->fi_writers = 0;
  		KASSERT(vp->v_fifoinfo == NULL, ("fifo_open: v_fifoinfo race"));
 		vp->v_fifoinfo = fip;
 	}
 	fpipe = fip->fi_pipe;
  	KASSERT(fpipe != NULL, ("fifo_open: pipe is NULL"));
 
 	/*
 	 * Use the pipe mutex here, in addition to the vnode lock,
 	 * in order to allow vnode lock dropping before msleep() calls
 	 * and still avoiding missed wakeups.
 	 */
 	PIPE_LOCK(fpipe);
 	if (ap->a_mode & FREAD) {
 		fip->fi_readers++;
 		fip->fi_rgen++;
 		if (fip->fi_readers == 1) {
 			fpipe->pipe_state &= ~PIPE_EOF;
 			if (fip->fi_writers > 0)
 				wakeup(&fip->fi_writers);
 		}
 		fp->f_pipegen = fpipe->pipe_wgen - fip->fi_writers;
 	}
 	if (ap->a_mode & FWRITE) {
 		if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
 			PIPE_UNLOCK(fpipe);
 			if (fip->fi_writers == 0)
 				fifo_cleanup(vp);
 			return (ENXIO);
 		}
 		fip->fi_writers++;
 		fip->fi_wgen++;
 		if (fip->fi_writers == 1) {
 			fpipe->pipe_state &= ~PIPE_EOF;
 			if (fip->fi_readers > 0)
 				wakeup(&fip->fi_readers);
 		}
 	}
 	if ((ap->a_mode & O_NONBLOCK) == 0) {
 		if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
 			gen = fip->fi_wgen;
 			VOP_UNLOCK(vp, 0);
 			stops_deferred = sigdeferstop(SIGDEFERSTOP_OFF);
 			error = msleep(&fip->fi_readers, PIPE_MTX(fpipe),
 			    PDROP | PCATCH | PSOCK, "fifoor", 0);
 			sigallowstop(stops_deferred);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (error != 0 && gen == fip->fi_wgen) {
 				fip->fi_readers--;
 				if (fip->fi_readers == 0) {
 					PIPE_LOCK(fpipe);
 					fpipe->pipe_state |= PIPE_EOF;
 					if (fpipe->pipe_state & PIPE_WANTW)
 						wakeup(fpipe);
 					PIPE_UNLOCK(fpipe);
 					fifo_cleanup(vp);
 				}
 				return (error);
 			}
 			PIPE_LOCK(fpipe);
 			/*
 			 * We must have got woken up because we had a writer.
 			 * That (and not still having one) is the condition
 			 * that we must wait for.
 			 */
 		}
 		if ((ap->a_mode & FWRITE) && fip->fi_readers == 0) {
 			gen = fip->fi_rgen;
 			VOP_UNLOCK(vp, 0);
 			stops_deferred = sigdeferstop(SIGDEFERSTOP_OFF);
 			error = msleep(&fip->fi_writers, PIPE_MTX(fpipe),
 			    PDROP | PCATCH | PSOCK, "fifoow", 0);
 			sigallowstop(stops_deferred);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			if (error != 0 && gen == fip->fi_rgen) {
 				fip->fi_writers--;
 				if (fip->fi_writers == 0) {
 					PIPE_LOCK(fpipe);
 					fpipe->pipe_state |= PIPE_EOF;
 					if (fpipe->pipe_state & PIPE_WANTR)
 						wakeup(fpipe);
 					fpipe->pipe_wgen++;
 					PIPE_UNLOCK(fpipe);
 					fifo_cleanup(vp);
 				}
 				return (error);
 			}
 			/*
 			 * We must have got woken up because we had
 			 * a reader.  That (and not still having one)
 			 * is the condition that we must wait for.
 			 */
 			PIPE_LOCK(fpipe);
 		}
 	}
 	PIPE_UNLOCK(fpipe);
 	KASSERT(fp != NULL, ("can't fifo/vnode bypass"));
 	KASSERT(fp->f_ops == &badfileops, ("not badfileops in fifo_open"));
 	finit(fp, fp->f_flag, DTYPE_FIFO, fpipe, &pipeops);
 	return (0);
 }
 
 /*
  * Device close routine
  */
 /* ARGSUSED */
 static int
 fifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct fifoinfo *fip;
 	struct pipe *cpipe;
 
 	vp = ap->a_vp;
 	fip = vp->v_fifoinfo;
 	cpipe = fip->fi_pipe;
 	ASSERT_VOP_ELOCKED(vp, "fifo_close");
 	if (ap->a_fflag & FREAD) {
 		fip->fi_readers--;
 		if (fip->fi_readers == 0) {
 			PIPE_LOCK(cpipe);
 			cpipe->pipe_state |= PIPE_EOF;
 			if ((cpipe->pipe_state & PIPE_WANTW)) {
 				cpipe->pipe_state &= ~PIPE_WANTW;
 				wakeup(cpipe);
 			}
 			pipeselwakeup(cpipe);
 			PIPE_UNLOCK(cpipe);
 		}
 	}
 	if (ap->a_fflag & FWRITE) {
 		fip->fi_writers--;
 		if (fip->fi_writers == 0) {
 			PIPE_LOCK(cpipe);
 			cpipe->pipe_state |= PIPE_EOF;
 			if ((cpipe->pipe_state & PIPE_WANTR)) {
 				cpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(cpipe);
 			}
 			cpipe->pipe_wgen++;
 			pipeselwakeup(cpipe);
 			PIPE_UNLOCK(cpipe);
 		}
 	}
 	fifo_cleanup(vp);
 	return (0);
 }
 
 /*
  * Print out internal contents of a fifo vnode.
  */
 int
 fifo_printinfo(vp)
 	struct vnode *vp;
 {
 	struct fifoinfo *fip = vp->v_fifoinfo;
 
 	if (fip == NULL){
 		printf(", NULL v_fifoinfo");
 		return (0);
 	}
 	printf(", fifo with %ld readers and %ld writers",
 		fip->fi_readers, fip->fi_writers);
 	return (0);
 }
 
 /*
  * Print out the contents of a fifo vnode.
  */
 static int
 fifo_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	printf("    ");
 	fifo_printinfo(ap->a_vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Fifo advisory byte-level locks.
  */
 /* ARGSUSED */
 static int
 fifo_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 
 	return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL);
 }
 
Index: head/sys/fs/fuse/fuse_vnops.c
===================================================================
--- head/sys/fs/fuse/fuse_vnops.c	(revision 355789)
+++ head/sys/fs/fuse/fuse_vnops.c	(revision 355790)
@@ -1,2487 +1,2489 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Copyright (c) 2019 The FreeBSD Foundation
  *
  * Portions of this software were developed by BFF Storage Systems, LLC under
  * sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/extattr.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/filedesc.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/dirent.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_object.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 #include "fuse_io.h"
 
 #include <sys/priv.h>
 
 /* Maximum number of hardlinks to a single FUSE file */
 #define FUSE_LINK_MAX                      UINT32_MAX
 
 SDT_PROVIDER_DECLARE(fusefs);
 /* 
  * Fuse trace probe:
  * arg0: verbosity.  Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*");
 
 /* vnode ops */
 static vop_access_t fuse_vnop_access;
 static vop_advlock_t fuse_vnop_advlock;
 static vop_bmap_t fuse_vnop_bmap;
 static vop_close_t fuse_fifo_close;
 static vop_close_t fuse_vnop_close;
 static vop_create_t fuse_vnop_create;
 static vop_deleteextattr_t fuse_vnop_deleteextattr;
 static vop_fdatasync_t fuse_vnop_fdatasync;
 static vop_fsync_t fuse_vnop_fsync;
 static vop_getattr_t fuse_vnop_getattr;
 static vop_getextattr_t fuse_vnop_getextattr;
 static vop_inactive_t fuse_vnop_inactive;
 static vop_link_t fuse_vnop_link;
 static vop_listextattr_t fuse_vnop_listextattr;
 static vop_lookup_t fuse_vnop_lookup;
 static vop_mkdir_t fuse_vnop_mkdir;
 static vop_mknod_t fuse_vnop_mknod;
 static vop_open_t fuse_vnop_open;
 static vop_pathconf_t fuse_vnop_pathconf;
 static vop_read_t fuse_vnop_read;
 static vop_readdir_t fuse_vnop_readdir;
 static vop_readlink_t fuse_vnop_readlink;
 static vop_reclaim_t fuse_vnop_reclaim;
 static vop_remove_t fuse_vnop_remove;
 static vop_rename_t fuse_vnop_rename;
 static vop_rmdir_t fuse_vnop_rmdir;
 static vop_setattr_t fuse_vnop_setattr;
 static vop_setextattr_t fuse_vnop_setextattr;
 static vop_strategy_t fuse_vnop_strategy;
 static vop_symlink_t fuse_vnop_symlink;
 static vop_write_t fuse_vnop_write;
 static vop_getpages_t fuse_vnop_getpages;
 static vop_print_t fuse_vnop_print;
 static vop_vptofh_t fuse_vnop_vptofh;
 
 struct vop_vector fuse_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		fuse_vnop_access,
 	.vop_close =		fuse_fifo_close,
 	.vop_fsync =		fuse_vnop_fsync,
 	.vop_getattr =		fuse_vnop_getattr,
 	.vop_inactive =		fuse_vnop_inactive,
 	.vop_pathconf =		fuse_vnop_pathconf,
 	.vop_print =		fuse_vnop_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		fuse_vnop_reclaim,
 	.vop_setattr =		fuse_vnop_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_vptofh =		fuse_vnop_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(fuse_fifoops);
 
 struct vop_vector fuse_vnops = {
 	.vop_allocate =	VOP_EINVAL,
 	.vop_default = &default_vnodeops,
 	.vop_access = fuse_vnop_access,
 	.vop_advlock = fuse_vnop_advlock,
 	.vop_bmap = fuse_vnop_bmap,
 	.vop_close = fuse_vnop_close,
 	.vop_create = fuse_vnop_create,
 	.vop_deleteextattr = fuse_vnop_deleteextattr,
 	.vop_fsync = fuse_vnop_fsync,
 	.vop_fdatasync = fuse_vnop_fdatasync,
 	.vop_getattr = fuse_vnop_getattr,
 	.vop_getextattr = fuse_vnop_getextattr,
 	.vop_inactive = fuse_vnop_inactive,
 	/*
 	 * TODO: implement vop_ioctl after upgrading to protocol 7.16.
 	 * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until
 	 * 7.16.
 	 */
 	.vop_link = fuse_vnop_link,
 	.vop_listextattr = fuse_vnop_listextattr,
 	.vop_lookup = fuse_vnop_lookup,
 	.vop_mkdir = fuse_vnop_mkdir,
 	.vop_mknod = fuse_vnop_mknod,
 	.vop_open = fuse_vnop_open,
 	.vop_pathconf = fuse_vnop_pathconf,
 	/*
 	 * TODO: implement vop_poll after upgrading to protocol 7.21.
 	 * FUSE_POLL was added in protocol 7.11, but it's kind of broken until
 	 * 7.21, which adds the ability for the client to choose which poll
 	 * events it wants, and for a client to deregister a file handle
 	 */
 	.vop_read = fuse_vnop_read,
 	.vop_readdir = fuse_vnop_readdir,
 	.vop_readlink = fuse_vnop_readlink,
 	.vop_reclaim = fuse_vnop_reclaim,
 	.vop_remove = fuse_vnop_remove,
 	.vop_rename = fuse_vnop_rename,
 	.vop_rmdir = fuse_vnop_rmdir,
 	.vop_setattr = fuse_vnop_setattr,
 	.vop_setextattr = fuse_vnop_setextattr,
 	.vop_strategy = fuse_vnop_strategy,
 	.vop_symlink = fuse_vnop_symlink,
 	.vop_write = fuse_vnop_write,
 	.vop_getpages = fuse_vnop_getpages,
 	.vop_print = fuse_vnop_print,
 	.vop_vptofh = fuse_vnop_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(fuse_vnops);
 
 uma_zone_t fuse_pbuf_zone;
 
 /* Check permission for extattr operations, much like extattr_check_cred */
 static int
 fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred,
 	struct thread *td, accmode_t accmode)
 {
 	struct mount *mp = vnode_mount(vp);
 	struct fuse_data *data = fuse_get_mpdata(mp);
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly manipulate
 	 * system attributes.
 	 */
 	switch (ns) {
 	case EXTATTR_NAMESPACE_SYSTEM:
 		if (data->dataflags & FSESS_DEFAULT_PERMISSIONS) {
 			return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 		}
 		/* FALLTHROUGH */
 	case EXTATTR_NAMESPACE_USER:
 		return (fuse_internal_access(vp, accmode, td, cred));
 	default:
 		return (EPERM);
 	}
 }
 
 /* Get a filehandle for a directory */
 static int
 fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp,
 	struct ucred *cred, pid_t pid)
 {
 	if (fuse_filehandle_get(vp, FREAD, fufhp, cred, pid) == 0)
 		return 0;
 	return fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid);
 }
 
 /* Send FUSE_FLUSH for this vnode */
 static int
 fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
 {
 	struct fuse_flush_in *ffi;
 	struct fuse_filehandle *fufh;
 	struct fuse_dispatcher fdi;
 	struct thread *td = curthread;
 	struct mount *mp = vnode_mount(vp);
 	int err;
 
 	if (!fsess_isimpl(vnode_mount(vp), FUSE_FLUSH))
 		return 0;
 
 	err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
 	if (err)
 		return err;
 
 	fdisp_init(&fdi, sizeof(*ffi));
 	fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred);
 	ffi = fdi.indata;
 	ffi->fh = fufh->fh_id;
 	/* 
 	 * If the file has a POSIX lock then we're supposed to set lock_owner.
 	 * If not, then lock_owner is undefined.  So we may as well always set
 	 * it.
 	 */
 	ffi->lock_owner = td->td_proc->p_pid;
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == ENOSYS) {
 		fsess_set_notimpl(mp, FUSE_FLUSH);
 		err = 0;
 	}
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /* Close wrapper for fifos.  */
 static int
 fuse_fifo_close(struct vop_close_args *ap)
 {
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
     struct vnop_access_args {
 	struct vnode *a_vp;
 #if VOP_ACCESS_TAKES_ACCMODE_T
 	accmode_t a_accmode;
 #else
 	int a_mode;
 #endif
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int accmode = ap->a_accmode;
 	struct ucred *cred = ap->a_cred;
 
 	struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
 
 	int err;
 
 	if (fuse_isdeadfs(vp)) {
 		if (vnode_isvroot(vp)) {
 			return 0;
 		}
 		return ENXIO;
 	}
 	if (!(data->dataflags & FSESS_INITED)) {
 		if (vnode_isvroot(vp)) {
 			if (priv_check_cred(cred, PRIV_VFS_ADMIN) ||
 			    (fuse_match_cred(data->daemoncred, cred) == 0)) {
 				return 0;
 			}
 		}
 		return EBADF;
 	}
 	if (vnode_islnk(vp)) {
 		return 0;
 	}
 
 	err = fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred);
 	return err;
 }
 
 /*
  * struct vop_advlock_args {
  *	struct vop_generic_args a_gen;
  *	struct vnode *a_vp;
  *	void *a_id;
  *	int a_op;
  *	struct flock *a_fl;
  *	int a_flags;
  * }
  */
 static int
 fuse_vnop_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct flock *fl = ap->a_fl;
 	struct thread *td = curthread;
 	struct ucred *cred = td->td_ucred;
 	pid_t pid = td->td_proc->p_pid;
 	struct fuse_filehandle *fufh;
 	struct fuse_dispatcher fdi;
 	struct fuse_lk_in *fli;
 	struct fuse_lk_out *flo;
 	enum fuse_opcode op;
 	int dataflags, err;
 	int flags = ap->a_flags;
 
 	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (!(dataflags & FSESS_POSIX_LOCKS))
 		return vop_stdadvlock(ap);
 	/* FUSE doesn't properly support flock until protocol 7.17 */
 	if (flags & F_FLOCK)
 		return vop_stdadvlock(ap);
 
 	err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid);
 	if (err)
 		return err;
 
 	fdisp_init(&fdi, sizeof(*fli));
 
 	switch(ap->a_op) {
 	case F_GETLK:
 		op = FUSE_GETLK;
 		break;
 	case F_SETLK:
 		op = FUSE_SETLK;
 		break;
 	case F_SETLKW:
 		op = FUSE_SETLKW;
 		break;
 	default:
 		return EINVAL;
 	}
 
 	fdisp_make_vp(&fdi, op, vp, td, cred);
 	fli = fdi.indata;
 	fli->fh = fufh->fh_id;
 	fli->owner = fl->l_pid;
 	fli->lk.start = fl->l_start;
 	if (fl->l_len != 0)
 		fli->lk.end = fl->l_start + fl->l_len - 1;
 	else
 		fli->lk.end = INT64_MAX;
 	fli->lk.type = fl->l_type;
 	fli->lk.pid = fl->l_pid;
 
 	err = fdisp_wait_answ(&fdi);
 	fdisp_destroy(&fdi);
 
 	if (err == 0 && op == FUSE_GETLK) {
 		flo = fdi.answ;
 		fl->l_type = flo->lk.type;
 		fl->l_pid = flo->lk.pid;
 		if (flo->lk.type != F_UNLCK) {
 			fl->l_start = flo->lk.start;
 			if (flo->lk.end == INT64_MAX)
 				fl->l_len = 0;
 			else
 				fl->l_len = flo->lk.end - flo->lk.start + 1;
 			fl->l_start = flo->lk.start;
 		}
 	}
 
 	return err;
 }
 
 /* {
 	struct vnode *a_vp;
 	daddr_t a_bn;
 	struct bufobj **a_bop;
 	daddr_t *a_bnp;
 	int *a_runp;
 	int *a_runb;
 } */
 static int
 fuse_vnop_bmap(struct vop_bmap_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj **bo = ap->a_bop;
 	struct thread *td = curthread;
 	struct mount *mp;
 	struct fuse_dispatcher fdi;
 	struct fuse_bmap_in *fbi;
 	struct fuse_bmap_out *fbo;
 	struct fuse_data *data;
 	uint64_t biosize;
 	off_t filesize;
 	daddr_t lbn = ap->a_bn;
 	daddr_t *pbn = ap->a_bnp;
 	int *runp = ap->a_runp;
 	int *runb = ap->a_runb;
 	int error = 0;
 	int maxrun;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	mp = vnode_mount(vp);
 	data = fuse_get_mpdata(mp);
 	biosize = fuse_iosize(vp);
 	maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1,
 		data->max_readahead_blocks);
 
 	if (bo != NULL)
 		*bo = &vp->v_bufobj;
 
 	/*
 	 * The FUSE_BMAP operation does not include the runp and runb
 	 * variables, so we must guess.  Report nonzero contiguous runs so
 	 * cluster_read will combine adjacent reads.  It's worthwhile to reduce
 	 * upcalls even if we don't know the true physical layout of the file.
 	 * 
 	 * FUSE file systems may opt out of read clustering in two ways:
 	 * * mounting with -onoclusterr
 	 * * Setting max_readahead <= maxbcachebuf during FUSE_INIT
 	 */
 	if (runb != NULL)
 		*runb = MIN(lbn, maxrun);
 	if (runp != NULL) {
 		error = fuse_vnode_size(vp, &filesize, td->td_ucred, td);
 		if (error == 0)
 			*runp = MIN(MAX(0, filesize / (off_t)biosize - lbn - 1),
 				    maxrun);
 		else
 			*runp = 0;
 	}
 
 	if (fsess_isimpl(mp, FUSE_BMAP)) {
 		fdisp_init(&fdi, sizeof(*fbi));
 		fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred);
 		fbi = fdi.indata;
 		fbi->block = lbn;
 		fbi->blocksize = biosize;
 		error = fdisp_wait_answ(&fdi);
 		if (error == ENOSYS) {
 			fdisp_destroy(&fdi);
 			fsess_set_notimpl(mp, FUSE_BMAP);
 			error = 0;
 		} else {
 			fbo = fdi.answ;
 			if (error == 0 && pbn != NULL)
 				*pbn = fbo->block;
 			fdisp_destroy(&fdi);
 			return error;
 		}
 	}
 
 	/* If the daemon doesn't support BMAP, make up a sensible default */
 	if (pbn != NULL)
 		*pbn = lbn * btodb(biosize);
 	return (error);
 }
 
 /*
     struct vop_close_args {
 	struct vnode *a_vp;
 	int  a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred = ap->a_cred;
 	int fflag = ap->a_fflag;
 	struct thread *td = ap->a_td;
 	pid_t pid = td->td_proc->p_pid;
 	int err = 0;
 
 	if (fuse_isdeadfs(vp))
 		return 0;
 	if (vnode_isdir(vp))
 		return 0;
 	if (fflag & IO_NDELAY)
 		return 0;
 
 	err = fuse_flush(vp, cred, pid, fflag);
 	/* TODO: close the file handle, if we're sure it's no longer used */
 	if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
 		fuse_vnode_savesize(vp, cred, td->td_proc->p_pid);
 	}
 	return err;
 }
 
 static void
 fdisp_make_mknod_for_fallback(
 	struct fuse_dispatcher *fdip,
 	struct componentname *cnp,
 	struct vnode *dvp,
 	uint64_t parentnid,
 	struct thread *td,
 	struct ucred *cred,
 	mode_t mode,
 	enum fuse_opcode *op)
 {
 	struct fuse_mknod_in *fmni;
 
 	fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1);
 	*op = FUSE_MKNOD;
 	fdisp_make(fdip, *op, vnode_mount(dvp), parentnid, td, cred);
 	fmni = fdip->indata;
 	fmni->mode = mode;
 	fmni->rdev = 0;
 	memcpy((char *)fdip->indata + sizeof(*fmni), cnp->cn_nameptr,
 	    cnp->cn_namelen);
 	((char *)fdip->indata)[sizeof(*fmni) + cnp->cn_namelen] = '\0';
 }
 /*
     struct vnop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	struct fuse_data *data;
 	struct fuse_create_in *fci;
 	struct fuse_entry_out *feo;
 	struct fuse_open_out *foo;
 	struct fuse_dispatcher fdi, fdi2;
 	struct fuse_dispatcher *fdip = &fdi;
 	struct fuse_dispatcher *fdip2 = NULL;
 
 	int err;
 
 	struct mount *mp = vnode_mount(dvp);
 	data = fuse_get_mpdata(mp);
 	uint64_t parentnid = VTOFUD(dvp)->nid;
 	mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	enum fuse_opcode op;
 	int flags;
 
 	if (fuse_isdeadfs(dvp))
 		return ENXIO;
 
 	/* FUSE expects sockets to be created with FUSE_MKNOD */
 	if (vap->va_type == VSOCK)
 		return fuse_internal_mknod(dvp, vpp, cnp, vap);
 
 	/* 
 	 * VOP_CREATE doesn't tell us the open(2) flags, so we guess.  Only a
 	 * writable mode makes sense, and we might as well include readability
 	 * too.
 	 */
 	flags = O_RDWR;
 
 	bzero(&fdi, sizeof(fdi));
 
 	if (vap->va_type != VREG)
 		return (EINVAL);
 
 	if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) {
 		/* Fallback to FUSE_MKNOD/FUSE_OPEN */
 		fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td,
 			cred, mode, &op);
 	} else {
 		/* Use FUSE_CREATE */
 		size_t insize;
 
 		op = FUSE_CREATE;
 		fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1);
 		fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred);
 		fci = fdip->indata;
 		fci->mode = mode;
 		fci->flags = O_CREAT | flags;
 		if (fuse_libabi_geq(data, 7, 12)) {
 			insize = sizeof(*fci);
 			fci->umask = td->td_proc->p_fd->fd_cmask;
 		} else {
 			insize = sizeof(struct fuse_open_in);
 		}
 
 		memcpy((char *)fdip->indata + insize, cnp->cn_nameptr,
 		    cnp->cn_namelen);
 		((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0';
 	}
 
 	err = fdisp_wait_answ(fdip);
 
 	if (err) {
 		if (err == ENOSYS && op == FUSE_CREATE) {
 			fsess_set_notimpl(mp, FUSE_CREATE);
 			fdisp_destroy(fdip);
 			fdisp_make_mknod_for_fallback(fdip, cnp, dvp,
 				parentnid, td, cred, mode, &op);
 			err = fdisp_wait_answ(fdip);
 		}
 		if (err)
 			goto out;
 	}
 
 	feo = fdip->answ;
 
 	if ((err = fuse_internal_checkentry(feo, vap->va_type))) {
 		goto out;
 	}
 
 	if (op == FUSE_CREATE) {
 		foo = (struct fuse_open_out*)(feo + 1);
 	} else {
 		/* Issue a separate FUSE_OPEN */
 		struct fuse_open_in *foi;
 
 		fdip2 = &fdi2;
 		fdisp_init(fdip2, sizeof(*foi));
 		fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td,
 			cred);
 		foi = fdip2->indata;
 		foi->flags = flags;
 		err = fdisp_wait_answ(fdip2);
 		if (err)
 			goto out;
 		foo = fdip2->answ;
 	}
 	err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type);
 	if (err) {
 		struct fuse_release_in *fri;
 		uint64_t nodeid = feo->nodeid;
 		uint64_t fh_id = foo->fh;
 
 		fdisp_init(fdip, sizeof(*fri));
 		fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred);
 		fri = fdip->indata;
 		fri->fh = fh_id;
 		fri->flags = flags;
 		fuse_insert_callback(fdip->tick, fuse_internal_forget_callback);
 		fuse_insert_message(fdip->tick, false);
 		goto out;
 	}
 	ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create");
 	fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
 		feo->attr_valid_nsec, NULL);
 
 	fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo);
 	fuse_vnode_open(*vpp, foo->open_flags, td);
 	/* 
 	 * Purge the parent's attribute cache because the daemon should've
 	 * updated its mtime and ctime
 	 */
 	fuse_vnode_clear_attr_cache(dvp);
 	cache_purge_negative(dvp);
 
 out:
 	if (fdip2)
 		fdisp_destroy(fdip2);
 	fdisp_destroy(fdip);
 	return err;
 }
 
 /*
     struct vnop_fdatasync_args {
 	struct vop_generic_args a_gen;
 	struct vnode * a_vp;
 	struct thread * a_td;
     };
 */
 static int
 fuse_vnop_fdatasync(struct vop_fdatasync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	int waitfor = MNT_WAIT;
 
 	int err = 0;
 
 	if (fuse_isdeadfs(vp)) {
 		return 0;
 	}
 	if ((err = vop_stdfdatasync_buf(ap)))
 		return err;
 
 	return fuse_internal_fsync(vp, td, waitfor, true);
 }
 
 /*
     struct vnop_fsync_args {
 	struct vop_generic_args a_gen;
 	struct vnode * a_vp;
 	int  a_waitfor;
 	struct thread * a_td;
     };
 */
 static int
 fuse_vnop_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	int waitfor = ap->a_waitfor;
 	int err = 0;
 
 	if (fuse_isdeadfs(vp)) {
 		return 0;
 	}
 	if ((err = vop_stdfsync(ap)))
 		return err;
 
 	return fuse_internal_fsync(vp, td, waitfor, false);
 }
 
 /*
     struct vnop_getattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 
 	int err = 0;
 	int dataflags;
 
 	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
 
 	/* Note that we are not bailing out on a dead file system just yet. */
 
 	if (!(dataflags & FSESS_INITED)) {
 		if (!vnode_isvroot(vp)) {
 			fdata_set_dead(fuse_get_mpdata(vnode_mount(vp)));
 			err = ENOTCONN;
 			return err;
 		} else {
 			goto fake;
 		}
 	}
 	err = fuse_internal_getattr(vp, vap, cred, td);
 	if (err == ENOTCONN && vnode_isvroot(vp)) {
 		/* see comment in fuse_vfsop_statfs() */
 		goto fake;
 	} else {
 		return err;
 	}
 
 fake:
 	bzero(vap, sizeof(*vap));
 	vap->va_type = vnode_vtype(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh, *fufh_tmp;
 
 	int need_flush = 1;
 
 	LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
 		if (need_flush && vp->v_type == VREG) {
 			if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
 				fuse_vnode_savesize(vp, NULL, 0);
 			}
 			if ((fvdat->flag & FN_REVOKED) != 0)
 				fuse_io_invalbuf(vp, td);
 			else
 				fuse_io_flushbuf(vp, MNT_WAIT, td);
 			need_flush = 0;
 		}
 		fuse_filehandle_close(vp, fufh, td, NULL);
 	}
 
 	if ((fvdat->flag & FN_REVOKED) != 0)
 		vrecycle(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_link_args {
 	struct vnode *a_tdvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
     };
 */
 static int
 fuse_vnop_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 
 	struct vattr *vap = VTOVA(vp);
 
 	struct fuse_dispatcher fdi;
 	struct fuse_entry_out *feo;
 	struct fuse_link_in fli;
 
 	int err;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (vnode_mount(tdvp) != vnode_mount(vp)) {
 		return EXDEV;
 	}
 
 	/*
 	 * This is a seatbelt check to protect naive userspace filesystems from
 	 * themselves and the limitations of the FUSE IPC protocol.  If a
 	 * filesystem does not allow attribute caching, assume it is capable of
 	 * validating that nlink does not overflow.
 	 */
 	if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX)
 		return EMLINK;
 	fli.oldnodeid = VTOI(vp);
 
 	fdisp_init(&fdi, 0);
 	fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp,
 	    FUSE_LINK, &fli, sizeof(fli), &fdi);
 	if ((err = fdisp_wait_answ(&fdi))) {
 		goto out;
 	}
 	feo = fdi.answ;
 
 	err = fuse_internal_checkentry(feo, vnode_vtype(vp));
 	if (!err) {
 		/* 
 		 * Purge the parent's attribute cache because the daemon
 		 * should've updated its mtime and ctime
 		 */
 		fuse_vnode_clear_attr_cache(tdvp);
 		fuse_internal_cache_attrs(vp, &feo->attr, feo->attr_valid,
 			feo->attr_valid_nsec, NULL);
 	}
 out:
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 struct fuse_lookup_alloc_arg {
 	struct fuse_entry_out *feo;
 	struct componentname *cnp;
 	uint64_t nid;
 	enum vtype vtyp;
 };
 
 /* Callback for vn_get_ino */
 static int
 fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
 	struct fuse_lookup_alloc_arg *flaa = arg;
 
 	return fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp,
 		flaa->vtyp);
 }
 
 SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup,
 	"int", "struct timespec*", "struct timespec*");
 /*
     struct vnop_lookup_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
     };
 */
 int
 fuse_vnop_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	int nameiop = cnp->cn_nameiop;
 	int flags = cnp->cn_flags;
 	int wantparent = flags & (LOCKPARENT | WANTPARENT);
 	int islastcn = flags & ISLASTCN;
 	struct mount *mp = vnode_mount(dvp);
 
 	int err = 0;
 	int lookup_err = 0;
 	struct vnode *vp = NULL;
 
 	struct fuse_dispatcher fdi;
 	bool did_lookup = false;
 	struct fuse_entry_out *feo = NULL;
 	enum vtype vtyp;	/* vnode type of target */
 	off_t filesize;		/* filesize of target */
 
 	uint64_t nid;
 
 	if (fuse_isdeadfs(dvp)) {
 		*vpp = NULL;
 		return ENXIO;
 	}
 	if (!vnode_isdir(dvp))
 		return ENOTDIR;
 
 	if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP))
 		return EROFS;
 
 	if ((err = fuse_internal_access(dvp, VEXEC, td, cred)))
 		return err;
 
 	if (flags & ISDOTDOT) {
 		KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID,
 			("Looking up .. is TODO"));
 		nid = VTOFUD(dvp)->parent_nid;
 		if (nid == 0)
 			return ENOENT;
 		/* .. is obviously a directory */
 		vtyp = VDIR;
 		filesize = 0;
 	} else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') {
 		nid = VTOI(dvp);
 		/* . is obviously a directory */
 		vtyp = VDIR;
 		filesize = 0;
 	} else {
 		struct timespec now, timeout;
 
 		err = cache_lookup(dvp, vpp, cnp, &timeout, NULL);
 		getnanouptime(&now);
 		SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now);
 		switch (err) {
 		case -1:		/* positive match */
 			if (timespeccmp(&timeout, &now, >)) {
 				counter_u64_add(fuse_lookup_cache_hits, 1);
 			} else {
 				/* Cache timeout */
 				counter_u64_add(fuse_lookup_cache_misses, 1);
 				bintime_clear(
 					&VTOFUD(*vpp)->entry_cache_timeout);
 				cache_purge(*vpp);
 				if (dvp != *vpp)
 					vput(*vpp);
 				else 
 					vrele(*vpp);
 				*vpp = NULL;
 				break;
 			}
 			return 0;
 
 		case 0:		/* no match in cache */
 			counter_u64_add(fuse_lookup_cache_misses, 1);
 			break;
 
 		case ENOENT:		/* negative match */
 			getnanouptime(&now);
 			if (timespeccmp(&timeout, &now, <=)) {
 				/* Cache timeout */
 				cache_purge_negative(dvp);
 				break;
 			}
 			/* fall through */
 		default:
 			return err;
 		}
 
 		nid = VTOI(dvp);
 		fdisp_init(&fdi, cnp->cn_namelen + 1);
 		fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred);
 
 		memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 		((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 		lookup_err = fdisp_wait_answ(&fdi);
 		did_lookup = true;
 
 		if (!lookup_err) {
 			/* lookup call succeeded */
 			feo = (struct fuse_entry_out *)fdi.answ;
 			nid = feo->nodeid;
 			if (nid == 0) {
 				/* zero nodeid means ENOENT and cache it */
 				struct timespec timeout;
 
 				fdi.answ_stat = ENOENT;
 				lookup_err = ENOENT;
 				if (cnp->cn_flags & MAKEENTRY) {
 					fuse_validity_2_timespec(feo, &timeout);
 					cache_enter_time(dvp, *vpp, cnp,
 						&timeout, NULL);
 				}
 			} else if (nid == FUSE_ROOT_ID) {
 				lookup_err = EINVAL;
 			}
 			vtyp = IFTOVT(feo->attr.mode);
 			filesize = feo->attr.size;
 		}
 		if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) {
 			fdisp_destroy(&fdi);
 			return lookup_err;
 		}
 	}
 	/* lookup_err, if non-zero, must be ENOENT at this point */
 
 	if (lookup_err) {
 		/* Entry not found */
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
 			err = fuse_internal_access(dvp, VWRITE, td, cred);
 			if (!err) {
 				/*
 				 * Set the SAVENAME flag to hold onto the
 				 * pathname for use later in VOP_CREATE or
 				 * VOP_RENAME.
 				 */
 				cnp->cn_flags |= SAVENAME;
 
 				err = EJUSTRETURN;
 			}
 		} else {
 			err = ENOENT;
 		}
 	} else {
 		/* Entry was found */
 		if (flags & ISDOTDOT) {
 			struct fuse_lookup_alloc_arg flaa;
 
 			flaa.nid = nid;
 			flaa.feo = feo;
 			flaa.cnp = cnp;
 			flaa.vtyp = vtyp;
 			err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0,
 				&vp);
 			*vpp = vp;
 		} else if (nid == VTOI(dvp)) {
 			vref(dvp);
 			*vpp = dvp;
 		} else {
 			struct fuse_vnode_data *fvdat;
 
 			err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
 			    &vp, cnp, vtyp);
 			if (err)
 				goto out;
 			*vpp = vp;
 
 			/*
 			 * In the case where we are looking up a FUSE node
 			 * represented by an existing cached vnode, and the
 			 * true size reported by FUSE_LOOKUP doesn't match
 			 * the vnode's cached size, then any cached writes
 			 * beyond the file's current size are lost.
 			 *
 			 * We can get here:
 			 * * following attribute cache expiration, or
 			 * * due a bug in the daemon, or
 			 */
 			fvdat = VTOFUD(vp);
 			if (vnode_isreg(vp) &&
 			    filesize != fvdat->cached_attrs.va_size &&
 			    fvdat->flag & FN_SIZECHANGE) {
 				/*
 				 * The FN_SIZECHANGE flag reflects a dirty
 				 * append.  If userspace lets us know our cache
 				 * is invalid, that write was lost.  (Dirty
 				 * writes that do not cause append are also
 				 * lost, but we don't detect them here.)
 				 *
 				 * XXX: Maybe disable WB caching on this mount.
 				 */
 				printf("%s: WB cache incoherent on %s!\n",
 				    __func__,
 				    vnode_mount(vp)->mnt_stat.f_mntonname);
 
 				fvdat->flag &= ~FN_SIZECHANGE;
 			}
 
 			MPASS(feo != NULL);
 			fuse_internal_cache_attrs(*vpp, &feo->attr,
 				feo->attr_valid, feo->attr_valid_nsec, NULL);
 			fuse_validity_2_bintime(feo->entry_valid,
 				feo->entry_valid_nsec,
 				&fvdat->entry_cache_timeout);
 
 			if ((nameiop == DELETE || nameiop == RENAME) &&
 				islastcn)
 			{
 				struct vattr dvattr;
 
 				err = fuse_internal_access(dvp, VWRITE, td,
 					cred);
 				if (err != 0)
 					goto out;
 				/* 
 				 * if the parent's sticky bit is set, check
 				 * whether we're allowed to remove the file.
 				 * Need to figure out the vnode locking to make
 				 * this work.
 				 */
 				fuse_internal_getattr(dvp, &dvattr, cred, td);
 				if ((dvattr.va_mode & S_ISTXT) &&
 					fuse_internal_access(dvp, VADMIN, td,
 						cred) &&
 					fuse_internal_access(*vpp, VADMIN, td,
 						cred)) {
 					err = EPERM;
 					goto out;
 				}
 			}
 
 			if (islastcn && (
 				(nameiop == DELETE) ||
 				(nameiop == RENAME && wantparent))) {
 				cnp->cn_flags |= SAVENAME;
 			}
 
 		}
 	}
 out:
 	if (err) {
 		if (vp != NULL && dvp != vp)
 			vput(vp);
 		else if (vp != NULL)
 			vrele(vp);
 		*vpp = NULL;
 	}
 	if (did_lookup)
 		fdisp_destroy(&fdi);
 
 	return err;
 }
 
 /*
     struct vnop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 
 	struct fuse_mkdir_in fmdi;
 
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	fmdi.umask = curthread->td_proc->p_fd->fd_cmask;
 
 	return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi,
 	    sizeof(fmdi), VDIR));
 }
 
 /*
     struct vnop_mknod_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_mknod(struct vop_mknod_args *ap)
 {
 
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 
 	if (fuse_isdeadfs(dvp))
 		return ENXIO;
 
 	return fuse_internal_mknod(dvp, vpp, cnp, vap);
 }
 
 /*
     struct vop_open_args {
 	struct vnode *a_vp;
 	int  a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	int a_fdidx; / struct file *a_fp;
     };
 */
 static int
 fuse_vnop_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int a_mode = ap->a_mode;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	pid_t pid = td->td_proc->p_pid;
 	struct fuse_vnode_data *fvdat;
 
 	if (fuse_isdeadfs(vp))
 		return ENXIO;
 	if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO)
 		return (EOPNOTSUPP);
 	if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0)
 		return EINVAL;
 
 	fvdat = VTOFUD(vp);
 
 	if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) {
 		fuse_vnode_open(vp, 0, td);
 		return 0;
 	}
 
 	return fuse_filehandle_open(vp, a_mode, NULL, td, cred);
 }
 
 static int
 fuse_vnop_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX);
 		return (0);
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		return (0);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 }
 
 /*
     struct vnop_read_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int  a_ioflag;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int ioflag = ap->a_ioflag;
 	struct ucred *cred = ap->a_cred;
 	pid_t pid = curthread->td_proc->p_pid;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (VTOFUD(vp)->flag & FN_DIRECTIO) {
 		ioflag |= IO_DIRECT;
 	}
 
 	return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
 }
 
 /*
     struct vnop_readdir_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
     };
 */
 static int
 fuse_vnop_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 	struct fuse_filehandle *fufh = NULL;
 	struct fuse_iov cookediov;
 	int err = 0;
 	u_long *cookies;
 	off_t startoff;
 	ssize_t tresid;
 	int ncookies;
 	bool closefufh = false;
 	pid_t pid = curthread->td_proc->p_pid;
 
 	if (ap->a_eofflag)
 		*ap->a_eofflag = 0;
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (				/* XXXIP ((uio_iovcnt(uio) > 1)) || */
 	    (uio_resid(uio) < sizeof(struct dirent))) {
 		return EINVAL;
 	}
 
 	tresid = uio->uio_resid;
 	startoff = uio->uio_offset;
 	err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
 	if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
 		/* 
 		 * nfsd will do VOP_READDIR without first doing VOP_OPEN.  We
 		 * must implicitly open the directory here
 		 */
 		err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
 		if (err == 0) {
 			/*
 			 * When a directory is opened, it must be read from
 			 * the beginning.  Hopefully, the "startoff" still
 			 * exists as an offset cookie for the directory.
 			 * If not, it will read the entire directory without
 			 * returning any entries and just return eof.
 			 */
 			uio->uio_offset = 0;
 		}
 		closefufh = true;
 	}
 	if (err)
 		return (err);
 	if (ap->a_ncookies != NULL) {
 		ncookies = uio->uio_resid /
 			(offsetof(struct dirent, d_name) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
 	fiov_init(&cookediov, DIRCOOKEDSIZE);
 
 	err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
 		&ncookies, cookies);
 
 	fiov_teardown(&cookediov);
 	if (closefufh)
 		fuse_filehandle_close(vp, fufh, curthread, cred);
 
 	if (ap->a_ncookies != NULL) {
 		if (err == 0) {
 			*ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	if (err == 0 && tresid == uio->uio_resid)
 		*ap->a_eofflag = 1;
 
 	return err;
 }
 
 /*
     struct vnop_readlink_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 
 	struct fuse_dispatcher fdi;
 	int err;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (!vnode_islnk(vp)) {
 		return EINVAL;
 	}
 	fdisp_init(&fdi, 0);
 	err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred);
 	if (err) {
 		goto out;
 	}
 	if (((char *)fdi.answ)[0] == '/' &&
 	    fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) {
 		char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname;
 
 		err = uiomove(mpth, strlen(mpth), uio);
 	}
 	if (!err) {
 		err = uiomove(fdi.answ, fdi.iosize, uio);
 	}
 out:
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_reclaim_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh, *fufh_tmp;
 
 	if (!fvdat) {
 		panic("FUSE: no vnode data during recycling");
 	}
 	LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
 		printf("FUSE: vnode being reclaimed with open fufh "
 			"(type=%#x)", fufh->fufh_type);
 		fuse_filehandle_close(vp, fufh, td, NULL);
 	}
 
 	if (!fuse_isdeadfs(vp) && fvdat->nlookup > 0) {
 		fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp),
 		    fvdat->nlookup);
 	}
 	cache_purge(vp);
 	vfs_hash_remove(vp);
 	fuse_vnode_destroy(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
     };
 */
 static int
 fuse_vnop_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct componentname *cnp = ap->a_cnp;
 
 	int err;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (vnode_isdir(vp)) {
 		return EPERM;
 	}
 
 	err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK);
 
 	return err;
 }
 
 /*
     struct vnop_rename_args {
 	struct vnode *a_fdvp;
 	struct vnode *a_fvp;
 	struct componentname *a_fcnp;
 	struct vnode *a_tdvp;
 	struct vnode *a_tvp;
 	struct componentname *a_tcnp;
     };
 */
 static int
 fuse_vnop_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct fuse_data *data;
 	bool newparent = fdvp != tdvp;
 	bool isdir = fvp->v_type == VDIR;
 	int err = 0;
 
 	if (fuse_isdeadfs(fdvp)) {
 		return ENXIO;
 	}
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
 		SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
 		err = EXDEV;
 		goto out;
 	}
 	cache_purge(fvp);
 
 	/*
 	 * FUSE library is expected to check if target directory is not
 	 * under the source directory in the file system tree.
 	 * Linux performs this check at VFS level.
 	 */
 	/* 
 	 * If source is a directory, and it will get a new parent, user must
 	 * have write permission to it, so ".." can be modified.
 	 */
 	data = fuse_get_mpdata(vnode_mount(tdvp));
 	if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) {
 		err = fuse_internal_access(fvp, VWRITE,
 			tcnp->cn_thread, tcnp->cn_cred);
 		if (err)
 			goto out;
 	}
 	sx_xlock(&data->rename_lock);
 	err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
 	if (err == 0) {
 		if (tdvp != fdvp)
 			fuse_vnode_setparent(fvp, tdvp);
 		if (tvp != NULL)
 			fuse_vnode_setparent(tvp, NULL);
 	}
 	sx_unlock(&data->rename_lock);
 
 	if (tvp != NULL && tvp != fvp) {
 		cache_purge(tvp);
 	}
 	if (vnode_isdir(fvp)) {
 		if ((tvp != NULL) && vnode_isdir(tvp)) {
 			cache_purge(tdvp);
 		}
 		cache_purge(fdvp);
 	}
 out:
 	if (tdvp == tvp) {
 		vrele(tdvp);
 	} else {
 		vput(tdvp);
 	}
 	if (tvp != NULL) {
 		vput(tvp);
 	}
 	vrele(fdvp);
 	vrele(fvp);
 
 	return err;
 }
 
 /*
     struct vnop_rmdir_args {
 	    struct vnode *a_dvp;
 	    struct vnode *a_vp;
 	    struct componentname *a_cnp;
     } *ap;
 */
 static int
 fuse_vnop_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 
 	int err;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (VTOFUD(vp) == VTOFUD(dvp)) {
 		return EINVAL;
 	}
 	err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR);
 
 	return err;
 }
 
 /*
     struct vnop_setattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	struct mount *mp;
 	struct fuse_data *data;
 	struct vattr old_va;
 	int dataflags;
 	int err = 0, err2;
 	accmode_t accmode = 0;
 	bool checkperm;
 	bool drop_suid = false;
 	gid_t cr_gid;
 
 	mp = vnode_mount(vp);
 	data = fuse_get_mpdata(mp);
 	dataflags = data->dataflags;
 	checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
 	if (cred->cr_ngroups > 0)
 		cr_gid = cred->cr_groups[0];
 	else
 		cr_gid = 0;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (vap->va_uid != (uid_t)VNOVAL) {
 		if (checkperm) {
 			/* Only root may change a file's owner */
 			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
 			if (err) {
 				/* As a special case, allow the null chown */
 				err2 = fuse_internal_getattr(vp, &old_va, cred,
 					td);
 				if (err2)
 					return (err2);
 				if (vap->va_uid != old_va.va_uid)
 					return err;
 				else
 					accmode |= VADMIN;
 				drop_suid = true;
 			} else
 				accmode |= VADMIN;
 		} else
 			accmode |= VADMIN;
 	}
 	if (vap->va_gid != (gid_t)VNOVAL) {
 		if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN))
 			drop_suid = true;
 		if (checkperm && !groupmember(vap->va_gid, cred))
 		{
 			/*
 			 * Non-root users may only chgrp to one of their own
 			 * groups 
 			 */
 			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
 			if (err) {
 				/* As a special case, allow the null chgrp */
 				err2 = fuse_internal_getattr(vp, &old_va, cred,
 					td);
 				if (err2)
 					return (err2);
 				if (vap->va_gid != old_va.va_gid)
 					return err;
 				accmode |= VADMIN;
 			} else
 				accmode |= VADMIN;
 		} else
 			accmode |= VADMIN;
 	}
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vfs_isrdonly(mp))
 				return (EROFS);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		/* Don't set accmode.  Permission to trunc is checked upstack */
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vap->va_vaflags & VA_UTIMES_NULL)
 			accmode |= VWRITE;
 		else
 			accmode |= VADMIN;
 	}
 	if (drop_suid) {
 		if (vap->va_mode != (mode_t)VNOVAL)
 			vap->va_mode &= ~(S_ISUID | S_ISGID);
 		else {
 			err = fuse_internal_getattr(vp, &old_va, cred, td);
 			if (err)
 				return (err);
 			vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID);
 		}
 	}
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only root may set the sticky bit on non-directories */
 		if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT)
 		    && priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 			return EFTYPE;
 		if (checkperm && (vap->va_mode & S_ISGID)) {
 			err = fuse_internal_getattr(vp, &old_va, cred, td);
 			if (err)
 				return (err);
 			if (!groupmember(old_va.va_gid, cred)) {
 				err = priv_check_cred(cred, PRIV_VFS_SETGID);
 				if (err)
 					return (err);
 			}
 		}
 		accmode |= VADMIN;
 	}
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	err = fuse_internal_access(vp, accmode, td, cred);
 	if (err)
 		return err;
 	else
 		return fuse_internal_setattr(vp, vap, td, cred);
 }
 
 /*
     struct vnop_strategy_args {
 	struct vnode *a_vp;
 	struct buf *a_bp;
     };
 */
 static int
 fuse_vnop_strategy(struct vop_strategy_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp = ap->a_bp;
 
 	if (!vp || fuse_isdeadfs(vp)) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = ENXIO;
 		bufdone(bp);
 		return 0;
 	}
 
 	/*
 	 * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags.
 	 * fuse_io_strategy sets bp's error fields
 	 */
 	(void)fuse_io_strategy(vp, bp);
 
 	return 0;
 }
 
 
 /*
     struct vnop_symlink_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 	char *a_target;
     };
 */
 static int
 fuse_vnop_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	const char *target = ap->a_target;
 
 	struct fuse_dispatcher fdi;
 
 	int err;
 	size_t len;
 
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	/*
 	 * Unlike the other creator type calls, here we have to create a message
 	 * where the name of the new entry comes first, and the data describing
 	 * the entry comes second.
 	 * Hence we can't rely on our handy fuse_internal_newentry() routine,
 	 * but put together the message manually and just call the core part.
 	 */
 
 	len = strlen(target) + 1;
 	fdisp_init(&fdi, len + cnp->cn_namelen + 1);
 	fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL);
 
 	memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 	((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 	memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len);
 
 	err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi);
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_write_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int  a_ioflag;
 	struct ucred *a_cred;
     };
 */
 static int
 fuse_vnop_write(struct vop_write_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int ioflag = ap->a_ioflag;
 	struct ucred *cred = ap->a_cred;
 	pid_t pid = curthread->td_proc->p_pid;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (VTOFUD(vp)->flag & FN_DIRECTIO) {
 		ioflag |= IO_DIRECT;
 	}
 
 	return fuse_io_dispatch(vp, uio, ioflag, cred, pid);
 }
 
 static daddr_t
 fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 	const int biosize = fuse_iosize(vp);
 
 	return (off / biosize);
 }
 
 static int
 fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	off_t filesize;
 	int blksz, err;
 	const int biosize = fuse_iosize(vp);
 
 	err = fuse_vnode_size(vp, &filesize, NULL, NULL);
 	KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here"));
 	if (err)
 		return biosize;
 
 	if ((off_t)lbn * biosize >= filesize) {
 		blksz = 0;
 	} else if ((off_t)(lbn + 1) * biosize > filesize) {
 		blksz = filesize - (off_t)lbn *biosize;
 	} else {
 		blksz = biosize;
 	}
 	return (blksz);
 }
 
 /*
     struct vnop_getpages_args {
 	struct vnode *a_vp;
 	vm_page_t *a_m;
 	int a_count;
 	int a_reqpage;
     };
 */
 static int
 fuse_vnop_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (!fsess_opt_mmap(vnode_mount(vp))) {
 		SDT_PROBE2(fusefs, , vnops, trace, 1,
 			"called on non-cacheable vnode??\n");
 		return (VM_PAGER_ERROR);
 	}
 
 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead, fuse_gbp_getblkno, fuse_gbp_getblksz));
 }
 
 static const char extattr_namespace_separator = '.';
 
 /*
     struct vop_getextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct uio *a_uio;
 	size_t *a_size;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getextattr(struct vop_getextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_getxattr_in *get_xattr_in;
 	struct fuse_getxattr_out *get_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	char *attr_str;
 	size_t len;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_GETXATTR))
 		return EOPNOTSUPP;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
 	if (err)
 		return err;
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len + sizeof(*get_xattr_in));
 	fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred);
 
 	get_xattr_in = fdi.indata;
 	/*
 	 * Check to see whether we're querying the available size or
 	 * issuing the actual request.  If we pass in 0, we get back struct
 	 * fuse_getxattr_out.  If we pass in a non-zero size, we get back
 	 * that much data, without the struct fuse_getxattr_out header.
 	 */
 	if (uio == NULL)
 		get_xattr_in->size = 0;
 	else
 		get_xattr_in->size = uio->uio_resid;
 
 	attr_str = (char *)fdi.indata + sizeof(*get_xattr_in);
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS) {
 			fsess_set_notimpl(mp, FUSE_GETXATTR);
 			err = EOPNOTSUPP;
 		}
 		goto out;
 	}
 
 	get_xattr_out = fdi.answ;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = get_xattr_out->size;
 
 	if (uio != NULL)
 		err = uiomove(fdi.answ, fdi.iosize, uio);
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vop_setextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_setextattr(struct vop_setextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_setxattr_in *set_xattr_in;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	size_t len;
 	char *attr_str;
 	int err;
 	
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_SETXATTR))
 		return EOPNOTSUPP;
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	/* Deleting xattrs must use VOP_DELETEEXTATTR instead */
 	if (ap->a_uio == NULL) {
 		/*
 		 * If we got here as fallback from VOP_DELETEEXTATTR, then
 		 * return EOPNOTSUPP.
 		 */
 		if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
 			return (EOPNOTSUPP);
 		else
 			return (EINVAL);
 	}
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
 		VWRITE);
 	if (err)
 		return err;
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid);
 	fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred);
 
 	set_xattr_in = fdi.indata;
 	set_xattr_in->size = uio->uio_resid;
 
 	attr_str = (char *)fdi.indata + sizeof(*set_xattr_in);
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len,
 	    uio->uio_resid, uio);
 	if (err != 0) {
 		goto out;
 	}
 
 	err = fdisp_wait_answ(&fdi);
 
 	if (err == ENOSYS) {
 		fsess_set_notimpl(mp, FUSE_SETXATTR);
 		err = EOPNOTSUPP;
 	}
 	if (err == ERESTART) {
 		/* Can't restart after calling uiomove */
 		err = EINTR;
 	}
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
  * The Linux / FUSE extended attribute list is simply a collection of
  * NUL-terminated strings.  The FreeBSD extended attribute list is a single
  * byte length followed by a non-NUL terminated string.  So, this allows
  * conversion of the Linux / FUSE format to the FreeBSD format in place.
  * Linux attribute names are reported with the namespace as a prefix (e.g.
  * "user.attribute_name"), but in FreeBSD they are reported without the
  * namespace prefix (e.g. "attribute_name").  So, we're going from:
  *
  * user.attr_name1\0user.attr_name2\0
  *
  * to:
  *
  * <num>attr_name1<num>attr_name2
  *
  * Where "<num>" is a single byte number of characters in the attribute name.
  * 
  * Args:
  * prefix - exattr namespace prefix string
  * list, list_len - input list with namespace prefixes
  * bsd_list, bsd_list_len - output list compatible with bsd vfs
  */
 static int
 fuse_xattrlist_convert(char *prefix, const char *list, int list_len,
     char *bsd_list, int *bsd_list_len)
 {
 	int len, pos, dist_to_next, prefix_len;
 
 	pos = 0;
 	*bsd_list_len = 0;
 	prefix_len = strlen(prefix);
 
 	while (pos < list_len && list[pos] != '\0') {
 		dist_to_next = strlen(&list[pos]) + 1;
 		if (bcmp(&list[pos], prefix, prefix_len) == 0 &&
 		    list[pos + prefix_len] == extattr_namespace_separator) {
 			len = dist_to_next -
 			    (prefix_len + sizeof(extattr_namespace_separator)) - 1;
 			if (len >= EXTATTR_MAXNAMELEN)
 				return (ENAMETOOLONG);
 
 			bsd_list[*bsd_list_len] = len;
 			memcpy(&bsd_list[*bsd_list_len + 1],
 			    &list[pos + prefix_len +
 			    sizeof(extattr_namespace_separator)], len);
 
 			*bsd_list_len += len + 1;
 		}
 
 		pos += dist_to_next;
 	}
 
 	return (0);
 }
 
 /*
  * List extended attributes
  *
  * The FUSE_LISTXATTR operation is based on Linux's listxattr(2) syscall, which
  * has a number of differences compared to its FreeBSD equivalent,
  * extattr_list_file:
  *
  * - FUSE_LISTXATTR returns all extended attributes across all namespaces,
  *   whereas listxattr(2) only returns attributes for a single namespace
  * - FUSE_LISTXATTR prepends each attribute name with "namespace."
  * - If the provided buffer is not large enough to hold the result,
  *   FUSE_LISTXATTR should return ERANGE, whereas listxattr is expected to
  *   return as many results as will fit.
  */
 /*
     struct vop_listextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	struct uio *a_uio;
 	size_t *a_size;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_listextattr(struct vop_listextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_listxattr_in *list_xattr_in;
 	struct fuse_listxattr_out *list_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	char *bsd_list = NULL;
 	char *linux_list;
 	int bsd_list_len;
 	int linux_list_len;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_LISTXATTR))
 		return EOPNOTSUPP;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
 	if (err)
 		return err;
 
 	/*
 	 * Add space for a NUL and the period separator if enabled.
 	 * Default to looking for user attributes.
 	 */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	fdisp_init(&fdi, sizeof(*list_xattr_in));
 	fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 
 	/*
 	 * Retrieve Linux / FUSE compatible list size.
 	 */
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = 0;
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS) {
 			fsess_set_notimpl(mp, FUSE_LISTXATTR);
 			err = EOPNOTSUPP;
 		}
 		goto out;
 	}
 
 	list_xattr_out = fdi.answ;
 	linux_list_len = list_xattr_out->size;
 	if (linux_list_len == 0) {
 		if (ap->a_size != NULL)
 			*ap->a_size = linux_list_len;
 		goto out;
 	}
 
 	/*
 	 * Retrieve Linux / FUSE compatible list values.
 	 */
 	fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = linux_list_len;
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == ERANGE) {
 		/* 
 		 * Race detected.  The attribute list must've grown since the
 		 * first FUSE_LISTXATTR call.  Start over.  Go all the way back
 		 * to userland so we can process signals, if necessary, before
 		 * restarting.
 		 */
 		err = ERESTART;
 		goto out;
 	} else if (err != 0)
 		goto out;
 
 	linux_list = fdi.answ;
 	/* FUSE doesn't allow the server to return more data than requested */
 	if (fdi.iosize > linux_list_len) {
 		printf("WARNING: FUSE protocol violation.  Server returned "
 			"more extended attribute data than requested; "
 			"should've returned ERANGE instead");
 	} else {
 		/* But returning less data is fine */
 		linux_list_len = fdi.iosize;
 	}
 
 	/*
 	 * Retrieve the BSD compatible list values.
 	 * The Linux / FUSE attribute list format isn't the same
 	 * as FreeBSD's format. So we need to transform it into
 	 * FreeBSD's format before giving it to the user.
 	 */
 	bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK);
 	err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len,
 	    bsd_list, &bsd_list_len);
 	if (err != 0)
 		goto out;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = bsd_list_len;
 
 	if (uio != NULL)
 		err = uiomove(bsd_list, bsd_list_len, uio);
 
 out:
 	free(bsd_list, M_TEMP);
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vop_deleteextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fuse_dispatcher fdi;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	size_t len;
 	char *attr_str;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
 		return EOPNOTSUPP;
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
 		VWRITE);
 	if (err)
 		return err;
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len);
 	fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred);
 
 	attr_str = fdi.indata;
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == ENOSYS) {
 		fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
 		err = EOPNOTSUPP;
 	}
 
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vnop_print_args {
 	struct vnode *a_vp;
     };
 */
 static int
 fuse_vnop_print(struct vop_print_args *ap)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp);
 
 	printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n",
 	    (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid,
 	    (uintmax_t)fvdat->nlookup,
 	    fvdat->flag);
 
 	return 0;
 }
 	
 /*
  * Get an NFS filehandle for a FUSE file.
  *
  * This will only work for FUSE file systems that guarantee the uniqueness of
  * nodeid:generation, which most don't.
  */
 /*
 vop_vptofh {
 	IN struct vnode *a_vp;
 	IN struct fid *a_fhp;
 };
 */
 static int
 fuse_vnop_vptofh(struct vop_vptofh_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp);
 	_Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid),
 		"FUSE fid type is too big");
 	struct mount *mp = vnode_mount(vp);
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	struct vattr va;
 	int err;
 
 	if (!(data->dataflags & FSESS_EXPORT_SUPPORT))
 		return EOPNOTSUPP;
 
 	err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
 	if (err)
 		return err;
 
 	/*ip = VTOI(ap->a_vp);*/
 	/*ufhp = (struct ufid *)ap->a_fhp;*/
 	fhp->len = sizeof(struct fuse_fid);
 	fhp->nid = fvdat->nid;
 	if (fvdat->generation <= UINT32_MAX)
 		fhp->gen = fvdat->generation;
 	else
 		return EOVERFLOW;
 	return (0);
 }
 
 
Index: head/sys/fs/msdosfs/msdosfs_vnops.c
===================================================================
--- head/sys/fs/msdosfs/msdosfs_vnops.c	(revision 355789)
+++ head/sys/fs/msdosfs/msdosfs_vnops.c	(revision 355790)
@@ -1,1964 +1,1965 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vnops.c,v 1.68 1998/02/10 14:10:04 mrg Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/clock.h>
 #include <sys/dirent.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vnode_pager.h>
 
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 /*
  * Prototypes for MSDOSFS vnode operations
  */
 static vop_create_t	msdosfs_create;
 static vop_mknod_t	msdosfs_mknod;
 static vop_open_t	msdosfs_open;
 static vop_close_t	msdosfs_close;
 static vop_access_t	msdosfs_access;
 static vop_getattr_t	msdosfs_getattr;
 static vop_setattr_t	msdosfs_setattr;
 static vop_read_t	msdosfs_read;
 static vop_write_t	msdosfs_write;
 static vop_fsync_t	msdosfs_fsync;
 static vop_remove_t	msdosfs_remove;
 static vop_link_t	msdosfs_link;
 static vop_rename_t	msdosfs_rename;
 static vop_mkdir_t	msdosfs_mkdir;
 static vop_rmdir_t	msdosfs_rmdir;
 static vop_symlink_t	msdosfs_symlink;
 static vop_readdir_t	msdosfs_readdir;
 static vop_bmap_t	msdosfs_bmap;
 static vop_getpages_t	msdosfs_getpages;
 static vop_strategy_t	msdosfs_strategy;
 static vop_print_t	msdosfs_print;
 static vop_pathconf_t	msdosfs_pathconf;
 static vop_vptofh_t	msdosfs_vptofh;
 
 /*
  * Some general notes:
  *
  * In the ufs filesystem the inodes, superblocks, and indirect blocks are
  * read/written using the vnode for the filesystem. Blocks that represent
  * the contents of a file are read/written using the vnode for the file
  * (including directories when they are read/written as files). This
  * presents problems for the dos filesystem because data that should be in
  * an inode (if dos had them) resides in the directory itself.  Since we
  * must update directory entries without the benefit of having the vnode
  * for the directory we must use the vnode for the filesystem.  This means
  * that when a directory is actually read/written (via read, write, or
  * readdir, or seek) we must use the vnode for the filesystem instead of
  * the vnode for the directory as would happen in ufs. This is to insure we
  * retrieve the correct block from the buffer cache since the hash value is
  * based upon the vnode address and the desired block number.
  */
 
 /*
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 msdosfs_create(struct vop_create_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode ndirent;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct timespec ts;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_create(cnp %p, vap %p\n", cnp, ap->a_vap);
 #endif
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad;
 	}
 
 	/*
 	 * Create a directory entry for the file, then call createde() to
 	 * have it installed. NOTE: DOS files are always executable.  We
 	 * use the absence of the owner write bit to make the file
 	 * readonly.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_create: no name");
 #endif
 	memset(&ndirent, 0, sizeof(ndirent));
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = ATTR_ARCHIVE;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = 0;
 	ndirent.de_FileSize = 0;
 	ndirent.de_pmp = pdep->de_pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	vfs_timestamp(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 	return (0);
 
 bad:
 	return (error);
 }
 
 static int
 msdosfs_mknod(struct vop_mknod_args *ap)
 {
 
     return (EINVAL);
 }
 
 static int
 msdosfs_open(struct vop_open_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	vnode_create_vobject(ap->a_vp, dep->de_FileSize, ap->a_td);
 	return 0;
 }
 
 static int
 msdosfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct timespec ts;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1) {
 		vfs_timestamp(&ts);
 		DETIMES(dep, &ts, &ts, &ts);
 	}
 	VI_UNLOCK(vp);
 	return 0;
 }
 
 static int
 msdosfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	mode_t file_mode;
 	accmode_t accmode = ap->a_accmode;
 
 	file_mode = S_IRWXU|S_IRWXG|S_IRWXO;
 	file_mode &= (vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask);
 
 	/*
 	 * Disallow writing to directories and regular files if the
 	 * filesystem is read-only.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 
 	return (vaccess(vp->v_type, file_mode, pmp->pm_uid, pmp->pm_gid,
 	    ap->a_accmode, ap->a_cred, NULL));
 }
 
 static int
 msdosfs_getattr(struct vop_getattr_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	mode_t mode;
 	struct timespec ts;
 	u_long dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 	uint64_t fileid;
 
 	vfs_timestamp(&ts);
 	DETIMES(dep, &ts, &ts, &ts);
 	vap->va_fsid = dev2udev(pmp->pm_dev);
 	/*
 	 * The following computation of the fileid must be the same as that
 	 * used in msdosfs_readdir() to compute d_fileno. If not, pwd
 	 * doesn't work.
 	 */
 	if (dep->de_Attributes & ATTR_DIRECTORY) {
 		fileid = (uint64_t)cntobn(pmp, dep->de_StartCluster) *
 		    dirsperblk;
 		if (dep->de_StartCluster == MSDOSFSROOT)
 			fileid = 1;
 	} else {
 		fileid = (uint64_t)cntobn(pmp, dep->de_dirclust) *
 		    dirsperblk;
 		if (dep->de_dirclust == MSDOSFSROOT)
 			fileid = (uint64_t)roottobn(pmp, 0) * dirsperblk;
 		fileid += (uoff_t)dep->de_diroffset / sizeof(struct direntry);
 	}
 	vap->va_fileid = fileid;
 
 	mode = S_IRWXU|S_IRWXG|S_IRWXO;
 	if (dep->de_Attributes & ATTR_READONLY)
 		mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH);
 	vap->va_mode = mode &
 	    (ap->a_vp->v_type == VDIR ? pmp->pm_dirmask : pmp->pm_mask);
 	vap->va_uid = pmp->pm_uid;
 	vap->va_gid = pmp->pm_gid;
 	vap->va_nlink = 1;
 	vap->va_rdev = NODEV;
 	vap->va_size = dep->de_FileSize;
 	fattime2timespec(dep->de_MDate, dep->de_MTime, 0, 0, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime;
 	if (pmp->pm_flags & MSDOSFSMNT_LONGNAME) {
 		fattime2timespec(dep->de_ADate, 0, 0, 0, &vap->va_atime);
 		fattime2timespec(dep->de_CDate, dep->de_CTime, dep->de_CHun,
 		    0, &vap->va_birthtime);
 	} else {
 		vap->va_atime = vap->va_mtime;
 		vap->va_birthtime.tv_sec = -1;
 		vap->va_birthtime.tv_nsec = 0;
 	}
 	vap->va_flags = 0;
 	if (dep->de_Attributes & ATTR_ARCHIVE)
 		vap->va_flags |= UF_ARCHIVE;
 	if (dep->de_Attributes & ATTR_HIDDEN)
 		vap->va_flags |= UF_HIDDEN;
 	if (dep->de_Attributes & ATTR_READONLY)
 		vap->va_flags |= UF_READONLY;
 	if (dep->de_Attributes & ATTR_SYSTEM)
 		vap->va_flags |= UF_SYSTEM;
 	vap->va_gen = 0;
 	vap->va_blocksize = pmp->pm_bpcluster;
 	vap->va_bytes =
 	    (dep->de_FileSize + pmp->pm_crbomask) & ~pmp->pm_crbomask;
 	vap->va_type = ap->a_vp->v_type;
 	vap->va_filerev = dep->de_modrev;
 	return (0);
 }
 
 static int
 msdosfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error = 0;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_setattr(): vp %p, vap %p, cred %p\n",
 	    ap->a_vp, vap, cred);
 #endif
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    (vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 #ifdef MSDOSFS_DEBUG
 		printf("msdosfs_setattr(): returning EINVAL\n");
 		printf("    va_type %d, va_nlink %llx, va_fsid %llx, va_fileid %llx\n",
 		    vap->va_type, (unsigned long long)vap->va_nlink,
 		    (unsigned long long)vap->va_fsid,
 		    (unsigned long long)vap->va_fileid);
 		printf("    va_blocksize %lx, va_rdev %llx, va_bytes %llx, va_gen %lx\n",
 		    vap->va_blocksize, (unsigned long long)vap->va_rdev,
 		    (unsigned long long)vap->va_bytes, vap->va_gen);
 		printf("    va_uid %x, va_gid %x\n",
 		    vap->va_uid, vap->va_gid);
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * We don't allow setting attributes on the root directory.
 	 * The special case for the root directory is because before
 	 * FAT32, the root directory didn't have an entry for itself
 	 * (and was otherwise special).  With FAT32, the root
 	 * directory is not so special, but still doesn't have an
 	 * entry for itself.
 	 */
 	if (vp->v_vflag & VV_ROOT)
 		return (EINVAL);
 
 	if (vap->va_flags != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN);
 			if (error)
 				return (error);
 		}
 		/*
 		 * We are very inconsistent about handling unsupported
 		 * attributes.  We ignored the access time and the
 		 * read and execute bits.  We were strict for the other
 		 * attributes.
 		 */
 		if (vap->va_flags & ~(UF_ARCHIVE | UF_HIDDEN | UF_READONLY |
 		    UF_SYSTEM))
 			return EOPNOTSUPP;
 		if (vap->va_flags & UF_ARCHIVE)
 			dep->de_Attributes |= ATTR_ARCHIVE;
 		else
 			dep->de_Attributes &= ~ATTR_ARCHIVE;
 		if (vap->va_flags & UF_HIDDEN)
 			dep->de_Attributes |= ATTR_HIDDEN;
 		else
 			dep->de_Attributes &= ~ATTR_HIDDEN;
 		/* We don't allow changing the readonly bit on directories. */
 		if (vp->v_type != VDIR) {
 			if (vap->va_flags & UF_READONLY)
 				dep->de_Attributes |= ATTR_READONLY;
 			else
 				dep->de_Attributes &= ~ATTR_READONLY;
 		}
 		if (vap->va_flags & UF_SYSTEM)
 			dep->de_Attributes |= ATTR_SYSTEM;
 		else
 			dep->de_Attributes &= ~ATTR_SYSTEM;
 		dep->de_flag |= DE_MODIFIED;
 	}
 
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		uid_t uid;
 		gid_t gid;
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		uid = vap->va_uid;
 		if (uid == (uid_t)VNOVAL)
 			uid = pmp->pm_uid;
 		gid = vap->va_gid;
 		if (gid == (gid_t)VNOVAL)
 			gid = pmp->pm_gid;
 		if (cred->cr_uid != pmp->pm_uid || uid != pmp->pm_uid ||
 		    (gid != pmp->pm_gid && !groupmember(gid, cred))) {
 			error = priv_check_cred(cred, PRIV_VFS_CHOWN);
 			if (error)
 				return (error);
 		}
 		if (uid != pmp->pm_uid || gid != pmp->pm_gid)
 			return EINVAL;
 	}
 
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VREG:
 			/*
 			 * Truncation is only supported for regular files,
 			 * Disallow it if the filesystem is read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support any file types except regular
 			 * files and directories in this file system, so
 			 * this (default) case is unreachable and can do
 			 * anything.  Keep falling through to detrunc()
 			 * for now.
 			 */
 			break;
 		}
 		error = detrunc(dep, vap->va_size, 0, cred);
 		if (error)
 			return error;
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		error = vn_utimes_perm(vp, vap, cred, td);
 		if (error != 0)
 			return (error);
 		if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95) == 0 &&
 		    vap->va_atime.tv_sec != VNOVAL) {
 			dep->de_flag &= ~DE_ACCESS;
 			timespec2fattime(&vap->va_atime, 0,
 			    &dep->de_ADate, NULL, NULL);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			dep->de_flag &= ~DE_UPDATE;
 			timespec2fattime(&vap->va_mtime, 0,
 			    &dep->de_MDate, &dep->de_MTime, NULL);
 		}
 		/*
 		 * We don't set the archive bit when modifying the time of
 		 * a directory to emulate the Windows/DOS behavior.
 		 */
 		if (vp->v_type != VDIR)
 			dep->de_Attributes |= ATTR_ARCHIVE;
 		dep->de_flag |= DE_MODIFIED;
 	}
 	/*
 	 * DOS files only have the ability to have their writability
 	 * attribute set, so we use the owner write bit to set the readonly
 	 * attribute.
 	 */
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if (cred->cr_uid != pmp->pm_uid) {
 			error = priv_check_cred(cred, PRIV_VFS_ADMIN);
 			if (error)
 				return (error);
 		}
 		if (vp->v_type != VDIR) {
 			/* We ignore the read and execute bits. */
 			if (vap->va_mode & S_IWUSR)
 				dep->de_Attributes &= ~ATTR_READONLY;
 			else
 				dep->de_Attributes |= ATTR_READONLY;
 			dep->de_Attributes |= ATTR_ARCHIVE;
 			dep->de_flag |= DE_MODIFIED;
 		}
 	}
 	return (deupdat(dep, 0));
 }
 
 static int
 msdosfs_read(struct vop_read_args *ap)
 {
 	int error = 0;
 	int blsize;
 	int isadir;
 	ssize_t orig_resid;
 	u_int n;
 	u_long diff;
 	u_long on;
 	daddr_t lbn;
 	daddr_t rablock;
 	int rasize;
 	int seqcount;
 	struct buf *bp;
 	struct vnode *vp = ap->a_vp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct uio *uio = ap->a_uio;
 
 	/*
 	 * If they didn't ask for any data, then we are done.
 	 */
 	orig_resid = uio->uio_resid;
 	if (orig_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 * We don't need to check for large offsets as in ffs because
 	 * dep->de_FileSize <= MSDOSFS_FILESIZE_MAX < OFF_MAX, so large
 	 * offsets cannot cause overflow even in theory.
 	 */
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 
 	isadir = dep->de_Attributes & ATTR_DIRECTORY;
 	do {
 		if (uio->uio_offset >= dep->de_FileSize)
 			break;
 		lbn = de_cluster(pmp, uio->uio_offset);
 		rablock = lbn + 1;
 		blsize = pmp->pm_bpcluster;
 		on = uio->uio_offset & pmp->pm_crbomask;
 		/*
 		 * If we are operating on a directory file then be sure to
 		 * do i/o with the vnode for the filesystem instead of the
 		 * vnode for the directory.
 		 */
 		if (isadir) {
 			/* convert cluster # to block # */
 			error = pcbmap(dep, lbn, &lbn, 0, &blsize);
 			if (error == E2BIG) {
 				error = EINVAL;
 				break;
 			} else if (error)
 				break;
 			error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp);
 		} else if (de_cn2off(pmp, rablock) >= dep->de_FileSize) {
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			error = cluster_read(vp, dep->de_FileSize, lbn, blsize,
 			    NOCRED, on + uio->uio_resid, seqcount, 0, &bp);
 		} else if (seqcount > 1) {
 			rasize = blsize;
 			error = breadn(vp, lbn,
 			    blsize, &rablock, &rasize, 1, NOCRED, &bp);
 		} else {
 			error = bread(vp, lbn, blsize, NOCRED, &bp);
 		}
 		if (error) {
 			brelse(bp);
 			break;
 		}
 		diff = pmp->pm_bpcluster - on;
 		n = diff > uio->uio_resid ? uio->uio_resid : diff;
 		diff = dep->de_FileSize - uio->uio_offset;
 		if (diff < n)
 			n = diff;
 		diff = blsize - bp->b_resid;
 		if (diff < n)
 			n = diff;
 		error = vn_io_fault_uiomove(bp->b_data + on, (int) n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	if (!isadir && (error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		dep->de_flag |= DE_ACCESS;
 	return (error);
 }
 
 /*
  * Write data to a file or directory.
  */
 static int
 msdosfs_write(struct vop_write_args *ap)
 {
 	int n;
 	int croffset;
 	ssize_t resid;
 	u_long osize;
 	int error = 0;
 	u_long count;
 	int seqcount;
 	daddr_t bn, lastcn;
 	struct buf *bp;
 	int ioflag = ap->a_ioflag;
 	struct uio *uio = ap->a_uio;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *thisvp;
 	struct denode *dep = VTODE(vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct ucred *cred = ap->a_cred;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_write(vp %p, uio %p, ioflag %x, cred %p\n",
 	    vp, uio, ioflag, cred);
 	printf("msdosfs_write(): diroff %lu, dirclust %lu, startcluster %lu\n",
 	    dep->de_diroffset, dep->de_dirclust, dep->de_StartCluster);
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = dep->de_FileSize;
 		thisvp = vp;
 		break;
 	case VDIR:
 		return EISDIR;
 	default:
 		panic("msdosfs_write(): bad file type");
 	}
 
 	/*
 	 * This is needed (unlike in ffs_write()) because we extend the
 	 * file outside of the loop but we don't want to extend the file
 	 * for writes of 0 bytes.
 	 */
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/*
 	 * The caller is supposed to ensure that
 	 * uio->uio_offset >= 0 and uio->uio_resid >= 0.
 	 */
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > MSDOSFS_FILESIZE_MAX)
 		return (EFBIG);
 
 	/*
 	 * If they've exceeded their filesize limit, tell them about it.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 
 	/*
 	 * If the offset we are starting the write at is beyond the end of
 	 * the file, then they've done a seek.  Unix filesystems allow
 	 * files with holes in them, DOS doesn't so we must fill the hole
 	 * with zeroed blocks.
 	 */
 	if (uio->uio_offset > dep->de_FileSize) {
 		error = deextend(dep, uio->uio_offset, cred);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Remember some values in case the write fails.
 	 */
 	resid = uio->uio_resid;
 	osize = dep->de_FileSize;
 
 	/*
 	 * If we write beyond the end of the file, extend it to its ultimate
 	 * size ahead of the time to hopefully get a contiguous area.
 	 */
 	if (uio->uio_offset + resid > osize) {
 		count = de_clcount(pmp, uio->uio_offset + resid) -
 			de_clcount(pmp, osize);
 		error = extendfile(dep, count, NULL, NULL, 0);
 		if (error &&  (error != ENOSPC || (ioflag & IO_UNIT)))
 			goto errexit;
 		lastcn = dep->de_fc[FC_LASTFC].fc_frcn;
 	} else
 		lastcn = de_clcount(pmp, osize) - 1;
 
 	seqcount = ioflag >> IO_SEQSHIFT;
 	do {
 		if (de_cluster(pmp, uio->uio_offset) > lastcn) {
 			error = ENOSPC;
 			break;
 		}
 
 		croffset = uio->uio_offset & pmp->pm_crbomask;
 		n = min(uio->uio_resid, pmp->pm_bpcluster - croffset);
 		if (uio->uio_offset + n > dep->de_FileSize) {
 			dep->de_FileSize = uio->uio_offset + n;
 			/* The object size needs to be set before buffer is allocated */
 			vnode_pager_setsize(vp, dep->de_FileSize);
 		}
 
 		bn = de_cluster(pmp, uio->uio_offset);
 		if ((uio->uio_offset & pmp->pm_crbomask) == 0
 		    && (de_cluster(pmp, uio->uio_offset + uio->uio_resid)
 			> de_cluster(pmp, uio->uio_offset)
 			|| uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) {
 			/*
 			 * If either the whole cluster gets written,
 			 * or we write the cluster from its start beyond EOF,
 			 * then no need to read data from disk.
 			 */
 			bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 			/*
 			 * This call to vfs_bio_clrbuf() ensures that
 			 * even if vn_io_fault_uiomove() below faults,
 			 * garbage from the newly instantiated buffer
 			 * is not exposed to the userspace via mmap().
 			 */
 			vfs_bio_clrbuf(bp);
 			/*
 			 * Do the bmap now, since pcbmap needs buffers
 			 * for the FAT table. (see msdosfs_strategy)
 			 */
 			if (bp->b_blkno == bp->b_lblkno) {
 				error = pcbmap(dep, bp->b_lblkno, &bn, 0, 0);
 				if (error)
 					bp->b_blkno = -1;
 				else
 					bp->b_blkno = bn;
 			}
 			if (bp->b_blkno == -1) {
 				brelse(bp);
 				if (!error)
 					error = EIO;		/* XXX */
 				break;
 			}
 		} else {
 			/*
 			 * The block we need to write into exists, so read it in.
 			 */
 			error = bread(thisvp, bn, pmp->pm_bpcluster, cred, &bp);
 			if (error) {
 				break;
 			}
 		}
 
 		/*
 		 * Should these vnode_pager_* functions be done on dir
 		 * files?
 		 */
 
 		/*
 		 * Copy the data from user space into the buf header.
 		 */
 		error = vn_io_fault_uiomove(bp->b_data + croffset, n, uio);
 		if (error) {
 			brelse(bp);
 			break;
 		}
 
 		/* Prepare for clustered writes in some else clauses. */
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 			bp->b_flags |= B_CLUSTEROK;
 
 		/*
 		 * If IO_SYNC, then each buffer is written synchronously.
 		 * Otherwise, if we have a severe page deficiency then
 		 * write the buffer asynchronously.  Otherwise, if on a
 		 * cluster boundary then write the buffer asynchronously,
 		 * combining it with contiguous clusters if permitted and
 		 * possible, since we don't expect more writes into this
 		 * buffer soon.  Otherwise, do a delayed write because we
 		 * expect more writes into this buffer soon.
 		 */
 		if (ioflag & IO_SYNC)
 			(void)bwrite(bp);
 		else if (vm_page_count_severe() || buf_dirty_count_severe())
 			bawrite(bp);
 		else if (n + croffset == pmp->pm_bpcluster) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0)
 				cluster_write(vp, bp, dep->de_FileSize,
 				    seqcount, 0);
 			else
 				bawrite(bp);
 		} else
 			bdwrite(bp);
 		dep->de_flag |= DE_UPDATE;
 	} while (error == 0 && uio->uio_resid > 0);
 
 	/*
 	 * If the write failed and they want us to, truncate the file back
 	 * to the size it was before the write was attempted.
 	 */
 errexit:
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			detrunc(dep, osize, ioflag & IO_SYNC, NOCRED);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		} else {
 			detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED);
 			if (uio->uio_resid != resid)
 				error = 0;
 		}
 	} else if (ioflag & IO_SYNC)
 		error = deupdat(dep, 1);
 	return (error);
 }
 
 /*
  * Flush the blocks of a file to disk.
  */
 static int
 msdosfs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *devvp;
 	int allerror, error;
 
 	vop_stdfsync(ap);
 
 	/*
 	* If the syncing request comes from fsync(2), sync the entire
 	* FAT and any other metadata that happens to be on devvp.  We
 	* need this mainly for the FAT.  We write the FAT sloppily, and
 	* syncing it all now is the best we can easily do to get all
 	* directory entries associated with the file (not just the file)
 	* fully synced.  The other metadata includes critical metadata
 	* for all directory entries, but only in the MNT_ASYNC case.  We
 	* will soon sync all metadata in the file's directory entry.
 	* Non-critical metadata for associated directory entries only
 	* gets synced accidentally, as in most file systems.
 	*/
 	if (ap->a_waitfor == MNT_WAIT) {
 		devvp = VTODE(ap->a_vp)->de_pmp->pm_devvp;
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 		allerror = VOP_FSYNC(devvp, MNT_WAIT, ap->a_td);
 		VOP_UNLOCK(devvp, 0);
 	} else
 		allerror = 0;
 
 	error = deupdat(VTODE(ap->a_vp), ap->a_waitfor == MNT_WAIT);
 	if (allerror == 0)
 		allerror = error;
 	return (allerror);
 }
 
 static int
 msdosfs_remove(struct vop_remove_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 	struct denode *ddep = VTODE(ap->a_dvp);
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
 		error = EPERM;
 	else
 		error = removede(ddep, dep);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_remove(), dep %p, v_usecount %d\n", dep, ap->a_vp->v_usecount);
 #endif
 	return (error);
 }
 
 /*
  * DOS filesystems don't know what links are.
  */
 static int
 msdosfs_link(struct vop_link_args *ap)
 {
 	return (EOPNOTSUPP);
 }
 
 /*
  * Renames on files require moving the denode to a new hash queue since the
  * denode's location is used to compute which hash queue to put the file
  * in. Unless it is a rename in place.  For example "mv a b".
  *
  * What follows is the basic algorithm:
  *
  * if (file move) {
  *	if (dest file exists) {
  *		remove dest file
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing directory slot
  *	} else {
  *		write new entry in dest directory
  *		update offset and dirclust in denode
  *		move denode to new hash chain
  *		clear old directory entry
  *	}
  * } else {
  *	directory move
  *	if (dest directory exists) {
  *		if (dest is not empty) {
  *			return ENOTEMPTY
  *		}
  *		remove dest directory
  *	}
  *	if (dest and src in same directory) {
  *		rewrite name in existing entry
  *	} else {
  *		be sure dest is not a child of src directory
  *		write entry in dest directory
  *		update "." and ".." in moved directory
  *		clear old directory entry for moved directory
  *	}
  * }
  *
  * On entry:
  *	source's parent directory is unlocked
  *	source file or directory is unlocked
  *	destination's parent directory is locked
  *	destination file or directory is locked if it exists
  *
  * On exit:
  *	all denodes should be released
  */
 static int
 msdosfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct denode *ip, *xp, *dp, *zp;
 	u_char toname[12], oldname[11];
 	u_long from_diroffset, to_diroffset;
 	u_char to_count;
 	int doingdirectory = 0, newparent = 0;
 	int error;
 	u_long cn, pcl;
 	daddr_t bn;
 	struct msdosfsmount *pmp;
 	struct direntry *dotdotp;
 	struct buf *bp;
 
 	pmp = VFSTOMSDOSFS(fdvp->v_mount);
 
 #ifdef DIAGNOSTIC
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 abortit:
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	/*
 	 * If source and dest are the same, do nothing.
 	 */
 	if (tvp == fvp) {
 		error = 0;
 		goto abortit;
 	}
 
 	error = vn_lock(fvp, LK_EXCLUSIVE);
 	if (error)
 		goto abortit;
 	dp = VTODE(fdvp);
 	ip = VTODE(fvp);
 
 	/*
 	 * Be sure we are not renaming ".", "..", or an alias of ".". This
 	 * leads to a crippled directory tree.  It's pretty tough to do a
 	 * "ls" or "pwd" with the "." directory entry missing, and "cd .."
 	 * doesn't work if the ".." entry is missing.
 	 */
 	if (ip->de_Attributes & ATTR_DIRECTORY) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip ||
 		    (fcnp->cn_flags & ISDOTDOT) ||
 		    (tcnp->cn_flags & ISDOTDOT) ||
 		    (ip->de_flag & DE_RENAME)) {
 			VOP_UNLOCK(fvp, 0);
 			error = EINVAL;
 			goto abortit;
 		}
 		ip->de_flag |= DE_RENAME;
 		doingdirectory++;
 	}
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTODE(tdvp);
 	xp = tvp ? VTODE(tvp) : NULL;
 	/*
 	 * Remember direntry place to use for destination
 	 */
 	to_diroffset = dp->de_fndoffset;
 	to_count = dp->de_fndcnt;
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to doscheckpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	if (VTODE(fdvp)->de_StartCluster != VTODE(tdvp)->de_StartCluster)
 		newparent = 1;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		/*
 		 * doscheckpath() vput()'s dp,
 		 * so we have to do a relookup afterwards
 		 */
 		error = doscheckpath(ip, dp);
 		if (error)
 			goto out;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("msdosfs_rename: lost to startdir");
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		dp = VTODE(tdvp);
 		xp = tvp ? VTODE(tvp) : NULL;
 	}
 
 	if (xp != NULL) {
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if (xp->de_Attributes & ATTR_DIRECTORY) {
 			if (!dosdirempty(xp)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = removede(dp, xp);
 		if (error)
 			goto bad;
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * Convert the filename in tcnp into a dos filename. We copy this
 	 * into the denode and directory entry for the destination
 	 * file/directory.
 	 */
 	error = uniqdosname(VTODE(tdvp), tcnp, toname);
 	if (error)
 		goto abortit;
 
 	/*
 	 * Since from wasn't locked at various places above,
 	 * have to do a relookup here.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	if ((fcnp->cn_flags & SAVESTART) == 0)
 		panic("msdosfs_rename: lost from startdir");
 	if (!newparent)
 		VOP_UNLOCK(tdvp, 0);
 	if (relookup(fdvp, &fvp, fcnp) == 0)
 		vrele(fdvp);
 	if (fvp == NULL) {
 		/*
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		if (newparent)
 			VOP_UNLOCK(tdvp, 0);
 		vrele(tdvp);
 		vrele(ap->a_fvp);
 		return 0;
 	}
 	xp = VTODE(fvp);
 	zp = VTODE(fdvp);
 	from_diroffset = zp->de_fndoffset;
 
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed till now. If the source is a file the entry may
 	 * have been unlinked or renamed. In either case there is
 	 * no further work to be done. If the source is a directory
 	 * then it cannot have been rmdir'ed or renamed; this is
 	 * prohibited by the DE_RENAME flag.
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
 			panic("rename: lost dir entry");
 		VOP_UNLOCK(fvp, 0);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0);
 		vrele(ap->a_fvp);
 		xp = NULL;
 	} else {
 		vrele(fvp);
 		xp = NULL;
 
 		/*
 		 * First write a new entry in the destination
 		 * directory and mark the entry in the source directory
 		 * as deleted.  Then move the denode to the correct hash
 		 * chain for its new location in the filesystem.  And, if
 		 * we moved a directory, then update its .. entry to point
 		 * to the new parent directory.
 		 */
 		memcpy(oldname, ip->de_Name, 11);
 		memcpy(ip->de_Name, toname, 11);	/* update denode */
 		dp->de_fndoffset = to_diroffset;
 		dp->de_fndcnt = to_count;
 		error = createde(ip, dp, (struct denode **)0, tcnp);
 		if (error) {
 			memcpy(ip->de_Name, oldname, 11);
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		/*
 		 * If ip is for a directory, then its name should always
 		 * be "." since it is for the directory entry in the
 		 * directory itself (msdosfs_lookup() always translates
 		 * to the "." entry so as to get a unique denode, except
 		 * for the root directory there are different
 		 * complications).  However, we just corrupted its name
 		 * to pass the correct name to createde().  Undo this.
 		 */
 		if ((ip->de_Attributes & ATTR_DIRECTORY) != 0)
 			memcpy(ip->de_Name, oldname, 11);
 		ip->de_refcnt++;
 		zp->de_fndoffset = from_diroffset;
 		error = removede(zp, ip);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			if (newparent)
 				VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		if (!doingdirectory) {
 			error = pcbmap(dp, de_cluster(pmp, to_diroffset), 0,
 				       &ip->de_dirclust, 0);
 			if (error) {
 				/* XXX should downgrade to ro here, fs is corrupt */
 				if (newparent)
 					VOP_UNLOCK(fdvp, 0);
 				VOP_UNLOCK(fvp, 0);
 				goto bad;
 			}
 			if (ip->de_dirclust == MSDOSFSROOT)
 				ip->de_diroffset = to_diroffset;
 			else
 				ip->de_diroffset = to_diroffset & pmp->pm_crbomask;
 		}
 		reinsert(ip);
 		if (newparent)
 			VOP_UNLOCK(fdvp, 0);
 	}
 
 	/*
 	 * If we moved a directory to a new parent directory, then we must
 	 * fixup the ".." entry in the moved directory.
 	 */
 	if (doingdirectory && newparent) {
 		cn = ip->de_StartCluster;
 		if (cn == MSDOSFSROOT) {
 			/* this should never happen */
 			panic("msdosfs_rename(): updating .. in root directory?");
 		} else
 			bn = cntobn(pmp, cn);
 		error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster,
 			      NOCRED, &bp);
 		if (error) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 		dotdotp = (struct direntry *)bp->b_data + 1;
 		pcl = dp->de_StartCluster;
 		if (FAT32(pmp) && pcl == pmp->pm_rootdirblk)
 			pcl = MSDOSFSROOT;
 		putushort(dotdotp->deStartCluster, pcl);
 		if (FAT32(pmp))
 			putushort(dotdotp->deHighClust, pcl >> 16);
 		if (DOINGASYNC(fvp))
 			bdwrite(bp);
 		else if ((error = bwrite(bp)) != 0) {
 			/* XXX should downgrade to ro here, fs is corrupt */
 			VOP_UNLOCK(fvp, 0);
 			goto bad;
 		}
 	}
 
 	/*
 	 * The msdosfs lookup is case insensitive. Several aliases may
 	 * be inserted for a single directory entry. As a consequnce,
 	 * name cache purge done by lookup for fvp when DELETE op for
 	 * namei is specified, might be not enough to expunge all
 	 * namecache entries that were installed for this direntry.
 	 */
 	cache_purge(fvp);
 	VOP_UNLOCK(fvp, 0);
 bad:
 	if (xp)
 		vput(tvp);
 	vput(tdvp);
 out:
 	ip->de_flag &= ~DE_RENAME;
 	vrele(fdvp);
 	vrele(fvp);
 	return (error);
 
 }
 
 static struct {
 	struct direntry dot;
 	struct direntry dotdot;
 } dosdirtemplate = {
 	{	".          ",				/* the . entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	},
 	{	"..         ",				/* the .. entry */
 		ATTR_DIRECTORY,				/* file attribute */
 		0,					/* reserved */
 		0, { 0, 0 }, { 0, 0 },			/* create time & date */
 		{ 0, 0 },				/* access date */
 		{ 0, 0 },				/* high bits of start cluster */
 		{ 210, 4 }, { 210, 4 },			/* modify time & date */
 		{ 0, 0 },				/* startcluster */
 		{ 0, 0, 0, 0 }				/* filesize */
 	}
 };
 
 static int
 msdosfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *dep;
 	struct denode *pdep = VTODE(ap->a_dvp);
 	struct direntry *denp;
 	struct msdosfsmount *pmp = pdep->de_pmp;
 	struct buf *bp;
 	u_long newcluster, pcl;
 	int bn;
 	int error;
 	struct denode ndirent;
 	struct timespec ts;
 
 	/*
 	 * If this is the root directory and there is no space left we
 	 * can't do anything.  This is because the root directory can not
 	 * change size.
 	 */
 	if (pdep->de_StartCluster == MSDOSFSROOT
 	    && pdep->de_fndoffset >= pdep->de_FileSize) {
 		error = ENOSPC;
 		goto bad2;
 	}
 
 	/*
 	 * Allocate a cluster to hold the about to be created directory.
 	 */
 	error = clusteralloc(pmp, 0, 1, CLUST_EOFE, &newcluster, NULL);
 	if (error)
 		goto bad2;
 
 	memset(&ndirent, 0, sizeof(ndirent));
 	ndirent.de_pmp = pmp;
 	ndirent.de_flag = DE_ACCESS | DE_CREATE | DE_UPDATE;
 	vfs_timestamp(&ts);
 	DETIMES(&ndirent, &ts, &ts, &ts);
 
 	/*
 	 * Now fill the cluster with the "." and ".." entries. And write
 	 * the cluster to disk.  This way it is there for the parent
 	 * directory to be pointing at if there were a crash.
 	 */
 	bn = cntobn(pmp, newcluster);
 	/* always succeeds */
 	bp = getblk(pmp->pm_devvp, bn, pmp->pm_bpcluster, 0, 0, 0);
 	memset(bp->b_data, 0, pmp->pm_bpcluster);
 	memcpy(bp->b_data, &dosdirtemplate, sizeof dosdirtemplate);
 	denp = (struct direntry *)bp->b_data;
 	putushort(denp[0].deStartCluster, newcluster);
 	putushort(denp[0].deCDate, ndirent.de_CDate);
 	putushort(denp[0].deCTime, ndirent.de_CTime);
 	denp[0].deCHundredth = ndirent.de_CHun;
 	putushort(denp[0].deADate, ndirent.de_ADate);
 	putushort(denp[0].deMDate, ndirent.de_MDate);
 	putushort(denp[0].deMTime, ndirent.de_MTime);
 	pcl = pdep->de_StartCluster;
 	/*
 	 * Although the root directory has a non-magic starting cluster
 	 * number for FAT32, chkdsk and fsck_msdosfs still require
 	 * references to it in dotdot entries to be magic.
 	 */
 	if (FAT32(pmp) && pcl == pmp->pm_rootdirblk)
 		pcl = MSDOSFSROOT;
 	putushort(denp[1].deStartCluster, pcl);
 	putushort(denp[1].deCDate, ndirent.de_CDate);
 	putushort(denp[1].deCTime, ndirent.de_CTime);
 	denp[1].deCHundredth = ndirent.de_CHun;
 	putushort(denp[1].deADate, ndirent.de_ADate);
 	putushort(denp[1].deMDate, ndirent.de_MDate);
 	putushort(denp[1].deMTime, ndirent.de_MTime);
 	if (FAT32(pmp)) {
 		putushort(denp[0].deHighClust, newcluster >> 16);
 		putushort(denp[1].deHighClust, pcl >> 16);
 	}
 
 	if (DOINGASYNC(ap->a_dvp))
 		bdwrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		goto bad;
 
 	/*
 	 * Now build up a directory entry pointing to the newly allocated
 	 * cluster.  This will be written to an empty slot in the parent
 	 * directory.
 	 */
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("msdosfs_mkdir: no name");
 #endif
 	error = uniqdosname(pdep, cnp, ndirent.de_Name);
 	if (error)
 		goto bad;
 
 	ndirent.de_Attributes = ATTR_DIRECTORY;
 	ndirent.de_LowerCase = 0;
 	ndirent.de_StartCluster = newcluster;
 	ndirent.de_FileSize = 0;
 	error = createde(&ndirent, pdep, &dep, cnp);
 	if (error)
 		goto bad;
 	*ap->a_vpp = DETOV(dep);
 	return (0);
 
 bad:
 	clusterfree(pmp, newcluster, NULL);
 bad2:
 	return (error);
 }
 
 static int
 msdosfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct denode *ip, *dp;
 	int error;
 
 	ip = VTODE(vp);
 	dp = VTODE(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	error = 0;
 	if (!dosdirempty(ip) || ip->de_flag & DE_RENAME) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	/*
 	 * Delete the entry from the directory.  For dos filesystems this
 	 * gets rid of the directory entry on disk, the in memory copy
 	 * still exists but the de_refcnt is <= 0.  This prevents it from
 	 * being found by deget().  When the vput() on dep is done we give
 	 * up access and eventually msdosfs_reclaim() will be called which
 	 * will remove it from the denode cache.
 	 */
 	error = removede(dp, ip);
 	if (error)
 		goto out;
 	/*
 	 * This is where we decrement the link count in the parent
 	 * directory.  Since dos filesystems don't do this we just purge
 	 * the name cache.
 	 */
 	cache_purge(dvp);
 	/*
 	 * Truncate the directory that is being deleted.
 	 */
 	error = detrunc(ip, (u_long)0, IO_SYNC, cnp->cn_cred);
 	cache_purge(vp);
 
 out:
 	return (error);
 }
 
 /*
  * DOS filesystems don't know what symlinks are.
  */
 static int
 msdosfs_symlink(struct vop_symlink_args *ap)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 msdosfs_readdir(struct vop_readdir_args *ap)
 {
 	struct mbnambuf nb;
 	int error = 0;
 	int diff;
 	long n;
 	int blsize;
 	long on;
 	u_long cn;
 	u_long dirsperblk;
 	long bias = 0;
 	daddr_t bn, lbn;
 	struct buf *bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 	struct dirent dirbuf;
 	struct uio *uio = ap->a_uio;
 	u_long *cookies = NULL;
 	int ncookies = 0;
 	off_t offset, off;
 	int chksum = -1;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_readdir(): vp %p, uio %p, cred %p, eofflagp %p\n",
 	    ap->a_vp, uio, ap->a_cred, ap->a_eofflag);
 #endif
 
 	/*
 	 * msdosfs_readdir() won't operate properly on regular files since
 	 * it does i/o only with the filesystem vnode, and hence can
 	 * retrieve the wrong block from the buffer cache for a plain file.
 	 * So, fail attempts to readdir() on a plain file.
 	 */
 	if ((dep->de_Attributes & ATTR_DIRECTORY) == 0)
 		return (ENOTDIR);
 
 	/*
 	 * To be safe, initialize dirbuf
 	 */
 	memset(dirbuf.d_name, 0, sizeof(dirbuf.d_name));
 
 	/*
 	 * If the user buffer is smaller than the size of one dos directory
 	 * entry or the file offset is not a multiple of the size of a
 	 * directory entry, then we fail the read.
 	 */
 	off = offset = uio->uio_offset;
 	if (uio->uio_resid < sizeof(struct direntry) ||
 	    (offset & (sizeof(struct direntry) - 1)))
 		return (EINVAL);
 
 	if (ap->a_ncookies) {
 		ncookies = uio->uio_resid / 16;
 		cookies = malloc(ncookies * sizeof(u_long), M_TEMP,
 		       M_WAITOK);
 		*ap->a_cookies = cookies;
 		*ap->a_ncookies = ncookies;
 	}
 
 	dirsperblk = pmp->pm_BytesPerSec / sizeof(struct direntry);
 
 	/*
 	 * If they are reading from the root directory then, we simulate
 	 * the . and .. entries since these don't exist in the root
 	 * directory.  We also set the offset bias to make up for having to
 	 * simulate these entries. By this I mean that at file offset 64 we
 	 * read the first entry in the root directory that lives on disk.
 	 */
 	if (dep->de_StartCluster == MSDOSFSROOT
 	    || (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)) {
 #if 0
 		printf("msdosfs_readdir(): going after . or .. in root dir, offset %d\n",
 		    offset);
 #endif
 		bias = 2 * sizeof(struct direntry);
 		if (offset < bias) {
 			for (n = (int)offset / sizeof(struct direntry);
 			     n < 2; n++) {
 				dirbuf.d_fileno = FAT32(pmp) ?
 				    (uint64_t)cntobn(pmp, pmp->pm_rootdirblk) *
 				    dirsperblk : 1;
 				dirbuf.d_type = DT_DIR;
 				switch (n) {
 				case 0:
 					dirbuf.d_namlen = 1;
 					dirbuf.d_name[0] = '.';
 					break;
 				case 1:
 					dirbuf.d_namlen = 2;
 					dirbuf.d_name[0] = '.';
 					dirbuf.d_name[1] = '.';
 					break;
 				}
 				dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
 				/* NOTE: d_off is the offset of the *next* entry. */
 				dirbuf.d_off = offset + sizeof(struct direntry);
 				dirent_terminate(&dirbuf);
 				if (uio->uio_resid < dirbuf.d_reclen)
 					goto out;
 				error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 				if (error)
 					goto out;
 				offset += sizeof(struct direntry);
 				off = offset;
 				if (cookies) {
 					*cookies++ = offset;
 					if (--ncookies <= 0)
 						goto out;
 				}
 			}
 		}
 	}
 
 	mbnambuf_init(&nb);
 	off = offset;
 	while (uio->uio_resid > 0) {
 		lbn = de_cluster(pmp, offset - bias);
 		on = (offset - bias) & pmp->pm_crbomask;
 		n = min(pmp->pm_bpcluster - on, uio->uio_resid);
 		diff = dep->de_FileSize - (offset - bias);
 		if (diff <= 0)
 			break;
 		n = min(n, diff);
 		error = pcbmap(dep, lbn, &bn, &cn, &blsize);
 		if (error)
 			break;
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			return (error);
 		}
 		n = min(n, blsize - bp->b_resid);
 		if (n == 0) {
 			brelse(bp);
 			return (EIO);
 		}
 
 		/*
 		 * Convert from dos directory entries to fs-independent
 		 * directory entries.
 		 */
 		for (dentp = (struct direntry *)(bp->b_data + on);
 		     (char *)dentp < bp->b_data + on + n;
 		     dentp++, offset += sizeof(struct direntry)) {
 #if 0
 			printf("rd: dentp %08x prev %08x crnt %08x deName %02x attr %02x\n",
 			    dentp, prev, crnt, dentp->deName[0], dentp->deAttributes);
 #endif
 			/*
 			 * If this is an unused entry, we can stop.
 			 */
 			if (dentp->deName[0] == SLOT_EMPTY) {
 				brelse(bp);
 				goto out;
 			}
 			/*
 			 * Skip deleted entries.
 			 */
 			if (dentp->deName[0] == SLOT_DELETED) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 
 			/*
 			 * Handle Win95 long directory entries
 			 */
 			if (dentp->deAttributes == ATTR_WIN95) {
 				if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 					continue;
 				chksum = win2unixfn(&nb,
 				    (struct winentry *)dentp, chksum, pmp);
 				continue;
 			}
 
 			/*
 			 * Skip volume labels
 			 */
 			if (dentp->deAttributes & ATTR_VOLUME) {
 				chksum = -1;
 				mbnambuf_init(&nb);
 				continue;
 			}
 			/*
 			 * This computation of d_fileno must match
 			 * the computation of va_fileid in
 			 * msdosfs_getattr.
 			 */
 			if (dentp->deAttributes & ATTR_DIRECTORY) {
 				cn = getushort(dentp->deStartCluster);
 				if (FAT32(pmp)) {
 					cn |= getushort(dentp->deHighClust) <<
 					    16;
 					if (cn == MSDOSFSROOT)
 						cn = pmp->pm_rootdirblk;
 				}
 				if (cn == MSDOSFSROOT && !FAT32(pmp))
 					dirbuf.d_fileno = 1;
 				else
 					dirbuf.d_fileno = cntobn(pmp, cn) *
 					    dirsperblk;
 				dirbuf.d_type = DT_DIR;
 			} else {
 				dirbuf.d_fileno = (uoff_t)offset /
 				    sizeof(struct direntry);
 				dirbuf.d_type = DT_REG;
 			}
 
 			if (chksum != winChksum(dentp->deName)) {
 				dirbuf.d_namlen = dos2unixfn(dentp->deName,
 				    (u_char *)dirbuf.d_name,
 				    dentp->deLowerCase |
 					((pmp->pm_flags & MSDOSFSMNT_SHORTNAME) ?
 					(LCASE_BASE | LCASE_EXT) : 0),
 				    pmp);
 				mbnambuf_init(&nb);
 			} else
 				mbnambuf_flush(&nb, &dirbuf);
 			chksum = -1;
 			dirbuf.d_reclen = GENERIC_DIRSIZ(&dirbuf);
 			/* NOTE: d_off is the offset of the *next* entry. */
 			dirbuf.d_off = offset + sizeof(struct direntry);
 			if (uio->uio_resid < dirbuf.d_reclen) {
 				brelse(bp);
 				goto out;
 			}
 			error = uiomove(&dirbuf, dirbuf.d_reclen, uio);
 			if (error) {
 				brelse(bp);
 				goto out;
 			}
 			if (cookies) {
 				*cookies++ = offset + sizeof(struct direntry);
 				if (--ncookies <= 0) {
 					brelse(bp);
 					goto out;
 				}
 			}
 			off = offset + sizeof(struct direntry);
 		}
 		brelse(bp);
 	}
 out:
 	/* Subtract unused cookies */
 	if (ap->a_ncookies)
 		*ap->a_ncookies -= ncookies;
 
 	uio->uio_offset = off;
 
 	/*
 	 * Set the eofflag (NFS uses it)
 	 */
 	if (ap->a_eofflag) {
 		if (dep->de_FileSize - (offset - bias) <= 0)
 			*ap->a_eofflag = 1;
 		else
 			*ap->a_eofflag = 0;
 	}
 	return (error);
 }
 
 /*-
  * a_vp   - pointer to the file's vnode
  * a_bn   - logical block number within the file (cluster number for us)
  * a_bop  - where to return the bufobj of the special file containing the fs
  * a_bnp  - where to return the "physical" block number corresponding to a_bn
  *          (relative to the special file; units are blocks of size DEV_BSIZE)
  * a_runp - where to return the "run past" a_bn.  This is the count of logical
  *          blocks whose physical blocks (together with a_bn's physical block)
  *          are contiguous.
  * a_runb - where to return the "run before" a_bn.
  */
 static int
 msdosfs_bmap(struct vop_bmap_args *ap)
 {
 	struct fatcache savefc;
 	struct denode *dep;
 	struct mount *mp;
 	struct msdosfsmount *pmp;
 	struct vnode *vp;
 	daddr_t runbn;
 	u_long cn;
 	int bnpercn, error, maxio, maxrun, run;
 
 	vp = ap->a_vp;
 	dep = VTODE(vp);
 	pmp = dep->de_pmp;
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &pmp->pm_devvp->v_bufobj;
 	if (ap->a_bnp == NULL)
 		return (0);
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 	cn = ap->a_bn;
 	if (cn != ap->a_bn)
 		return (EFBIG);
 	error = pcbmap(dep, cn, ap->a_bnp, NULL, NULL);
 	if (error != 0 || (ap->a_runp == NULL && ap->a_runb == NULL))
 		return (error);
 
 	/*
 	 * Prepare to back out updates of the fatchain cache after the one
 	 * for the first block done by pcbmap() above.  Without the backout,
 	 * then whenever the caller doesn't do i/o to all of the blocks that
 	 * we find, the single useful cache entry would be too far in advance
 	 * of the actual i/o to work for the next sequential i/o.  Then the
 	 * FAT would be searched from the beginning.  With the backout, the
 	 * FAT is searched starting at most a few blocks early.  This wastes
 	 * much less time.  Time is also wasted finding more blocks than the
 	 * caller will do i/o to.  This is necessary because the runlength
 	 * parameters are output-only.
 	 */
 	savefc = dep->de_fc[FC_LASTMAP];
 
 	mp = vp->v_mount;
 	maxio = mp->mnt_iosize_max / mp->mnt_stat.f_iosize;
 	bnpercn = de_cn2bn(pmp, 1);
 	if (ap->a_runp != NULL) {
 		maxrun = ulmin(maxio - 1, pmp->pm_maxcluster - cn);
 		for (run = 1; run <= maxrun; run++) {
 			if (pcbmap(dep, cn + run, &runbn, NULL, NULL) != 0 ||
 			    runbn != *ap->a_bnp + run * bnpercn)
 				break;
 		}
 		*ap->a_runp = run - 1;
 	}
 	if (ap->a_runb != NULL) {
 		maxrun = ulmin(maxio - 1, cn);
 		for (run = 1; run < maxrun; run++) {
 			if (pcbmap(dep, cn - run, &runbn, NULL, NULL) != 0 ||
 			    runbn != *ap->a_bnp - run * bnpercn)
 				break;
 		}
 		*ap->a_runb = run - 1;
 	}
 	dep->de_fc[FC_LASTMAP] = savefc;
 	return (0);
 }
 
 SYSCTL_NODE(_vfs, OID_AUTO, msdosfs, CTLFLAG_RW, 0, "msdos filesystem");
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_msdosfs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN,
     &use_buf_pager, 0,
     "Use buffer pager instead of bmap");
 
 static daddr_t
 msdosfs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 
 	return (de_cluster(VTODE(vp)->de_pmp, off));
 }
 
 static int
 msdosfs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 
 	return (VTODE(vp)->de_pmp->pm_bpcluster);
 }
 
 static int
 msdosfs_getpages(struct vop_getpages_args *ap)
 {
 
 	if (use_buf_pager)
 		return (vfs_bio_getpages(ap->a_vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, msdosfs_gbp_getblkno,
 		    msdosfs_gbp_getblksz));
 	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, NULL, NULL));
 }
 
 static int
 msdosfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct denode *dep = VTODE(ap->a_vp);
 	struct bufobj *bo;
 	int error = 0;
 	daddr_t blkno;
 
 	/*
 	 * If we don't already know the filesystem relative block number
 	 * then get it using pcbmap().  If pcbmap() returns the block
 	 * number as -1 then we've got a hole in the file.  DOS filesystems
 	 * don't allow files with holes, so we shouldn't ever see this.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = pcbmap(dep, bp->b_lblkno, &blkno, 0, 0);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if (bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	/*
 	 * Read/write the block from/to the disk that contains the desired
 	 * file block.
 	 */
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = dep->de_pmp->pm_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 static int
 msdosfs_print(struct vop_print_args *ap)
 {
 	struct denode *dep = VTODE(ap->a_vp);
 
 	printf("\tstartcluster %lu, dircluster %lu, diroffset %lu, ",
 	       dep->de_StartCluster, dep->de_dirclust, dep->de_diroffset);
 	printf("on dev %s\n", devtoname(dep->de_pmp->pm_dev));
 	return (0);
 }
 
 static int
 msdosfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct msdosfsmount *pmp = VTODE(ap->a_vp)->de_pmp;
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 32;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = pmp->pm_flags & MSDOSFSMNT_LONGNAME ? WIN_MAXLEN : 12;
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 0;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 static int
 msdosfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct denode *dep;
 	struct defid *defhp;
 
 	dep = VTODE(ap->a_vp);
 	defhp = (struct defid *)ap->a_fhp;
 	defhp->defid_len = sizeof(struct defid);
 	defhp->defid_dirclust = dep->de_dirclust;
 	defhp->defid_dirofs = dep->de_diroffset;
 	/* defhp->defid_gen = dep->de_gen; */
 	return (0);
 }
 
 /* Global vfs data structures for msdosfs */
 struct vop_vector msdosfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		msdosfs_access,
 	.vop_bmap =		msdosfs_bmap,
 	.vop_getpages =		msdosfs_getpages,
 	.vop_cachedlookup =	msdosfs_lookup,
 	.vop_open =		msdosfs_open,
 	.vop_close =		msdosfs_close,
 	.vop_create =		msdosfs_create,
 	.vop_fsync =		msdosfs_fsync,
 	.vop_fdatasync =	vop_stdfdatasync_buf,
 	.vop_getattr =		msdosfs_getattr,
 	.vop_inactive =		msdosfs_inactive,
 	.vop_link =		msdosfs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		msdosfs_mkdir,
 	.vop_mknod =		msdosfs_mknod,
 	.vop_pathconf =		msdosfs_pathconf,
 	.vop_print =		msdosfs_print,
 	.vop_read =		msdosfs_read,
 	.vop_readdir =		msdosfs_readdir,
 	.vop_reclaim =		msdosfs_reclaim,
 	.vop_remove =		msdosfs_remove,
 	.vop_rename =		msdosfs_rename,
 	.vop_rmdir =		msdosfs_rmdir,
 	.vop_setattr =		msdosfs_setattr,
 	.vop_strategy =		msdosfs_strategy,
 	.vop_symlink =		msdosfs_symlink,
 	.vop_write =		msdosfs_write,
 	.vop_vptofh =		msdosfs_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(msdosfs_vnodeops);
Index: head/sys/fs/nfsclient/nfs_clvnops.c
===================================================================
--- head/sys/fs/nfsclient/nfs_clvnops.c	(revision 355789)
+++ head/sys/fs/nfsclient/nfs_clvnops.c	(revision 355790)
@@ -1,4290 +1,4294 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from nfs_vnops.c	8.16 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * vnode op calls for Sun NFS version 2, 3 and 4
  */
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/resourcevar.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/extattr.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/namei.h>
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 #include <fs/nfs/nfsport.h>
 #include <fs/nfsclient/nfsnode.h>
 #include <fs/nfsclient/nfsmount.h>
 #include <fs/nfsclient/nfs.h>
 #include <fs/nfsclient/nfs_kdtrace.h>
 
 #include <net/if.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 
 #include <nfs/nfs_lock.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 
 dtrace_nfsclient_accesscache_flush_probe_func_t
 		dtrace_nfscl_accesscache_flush_done_probe;
 uint32_t	nfscl_accesscache_flush_done_id;
 
 dtrace_nfsclient_accesscache_get_probe_func_t
 		dtrace_nfscl_accesscache_get_hit_probe,
 		dtrace_nfscl_accesscache_get_miss_probe;
 uint32_t	nfscl_accesscache_get_hit_id;
 uint32_t	nfscl_accesscache_get_miss_id;
 
 dtrace_nfsclient_accesscache_load_probe_func_t
 		dtrace_nfscl_accesscache_load_done_probe;
 uint32_t	nfscl_accesscache_load_done_id;
 #endif /* !KDTRACE_HOOKS */
 
 /* Defs */
 #define	TRUE	1
 #define	FALSE	0
 
 extern struct nfsstatsv1 nfsstatsv1;
 extern int nfsrv_useacl;
 extern int nfscl_debuglevel;
 MALLOC_DECLARE(M_NEWNFSREQ);
 
 static vop_read_t	nfsfifo_read;
 static vop_write_t	nfsfifo_write;
 static vop_close_t	nfsfifo_close;
 static int	nfs_setattrrpc(struct vnode *, struct vattr *, struct ucred *,
 		    struct thread *);
 static vop_lookup_t	nfs_lookup;
 static vop_create_t	nfs_create;
 static vop_mknod_t	nfs_mknod;
 static vop_open_t	nfs_open;
 static vop_pathconf_t	nfs_pathconf;
 static vop_close_t	nfs_close;
 static vop_access_t	nfs_access;
 static vop_getattr_t	nfs_getattr;
 static vop_setattr_t	nfs_setattr;
 static vop_read_t	nfs_read;
 static vop_fsync_t	nfs_fsync;
 static vop_remove_t	nfs_remove;
 static vop_link_t	nfs_link;
 static vop_rename_t	nfs_rename;
 static vop_mkdir_t	nfs_mkdir;
 static vop_rmdir_t	nfs_rmdir;
 static vop_symlink_t	nfs_symlink;
 static vop_readdir_t	nfs_readdir;
 static vop_strategy_t	nfs_strategy;
 static	int	nfs_lookitup(struct vnode *, char *, int,
 		    struct ucred *, struct thread *, struct nfsnode **);
 static	int	nfs_sillyrename(struct vnode *, struct vnode *,
 		    struct componentname *);
 static vop_access_t	nfsspec_access;
 static vop_readlink_t	nfs_readlink;
 static vop_print_t	nfs_print;
 static vop_advlock_t	nfs_advlock;
 static vop_advlockasync_t nfs_advlockasync;
 static vop_getacl_t nfs_getacl;
 static vop_setacl_t nfs_setacl;
 static vop_advise_t nfs_advise;
 static vop_allocate_t nfs_allocate;
 static vop_copy_file_range_t nfs_copy_file_range;
 static vop_ioctl_t nfs_ioctl;
 static vop_getextattr_t nfs_getextattr;
 static vop_setextattr_t nfs_setextattr;
 static vop_listextattr_t nfs_listextattr;
 static vop_deleteextattr_t nfs_deleteextattr;
 static vop_lock1_t	nfs_lock;
 
 /*
  * Global vfs data structures for nfs
  */
 
 static struct vop_vector newnfs_vnodeops_nosig = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		nfs_access,
 	.vop_advlock =		nfs_advlock,
 	.vop_advlockasync =	nfs_advlockasync,
 	.vop_close =		nfs_close,
 	.vop_create =		nfs_create,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_getpages =		ncl_getpages,
 	.vop_putpages =		ncl_putpages,
 	.vop_inactive =		ncl_inactive,
 	.vop_link =		nfs_link,
 	.vop_lock1 =		nfs_lock,
 	.vop_lookup =		nfs_lookup,
 	.vop_mkdir =		nfs_mkdir,
 	.vop_mknod =		nfs_mknod,
 	.vop_open =		nfs_open,
 	.vop_pathconf =		nfs_pathconf,
 	.vop_print =		nfs_print,
 	.vop_read =		nfs_read,
 	.vop_readdir =		nfs_readdir,
 	.vop_readlink =		nfs_readlink,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_remove =		nfs_remove,
 	.vop_rename =		nfs_rename,
 	.vop_rmdir =		nfs_rmdir,
 	.vop_setattr =		nfs_setattr,
 	.vop_strategy =		nfs_strategy,
 	.vop_symlink =		nfs_symlink,
 	.vop_write =		ncl_write,
 	.vop_getacl =		nfs_getacl,
 	.vop_setacl =		nfs_setacl,
 	.vop_advise =		nfs_advise,
 	.vop_allocate =		nfs_allocate,
 	.vop_copy_file_range =	nfs_copy_file_range,
 	.vop_ioctl =		nfs_ioctl,
 	.vop_getextattr =	nfs_getextattr,
 	.vop_setextattr =	nfs_setextattr,
 	.vop_listextattr =	nfs_listextattr,
 	.vop_deleteextattr =	nfs_deleteextattr,
 };
+VFS_VOP_VECTOR_REGISTER(newnfs_vnodeops_nosig);
 
 static int
 nfs_vnodeops_bypass(struct vop_generic_args *a)
 {
 
 	return (vop_sigdefer(&newnfs_vnodeops_nosig, a));
 }
 
 struct vop_vector newnfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_bypass =		nfs_vnodeops_bypass,
 };
+VFS_VOP_VECTOR_REGISTER(newnfs_vnodeops);
 
 static struct vop_vector newnfs_fifoops_nosig = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		nfsspec_access,
 	.vop_close =		nfsfifo_close,
 	.vop_fsync =		nfs_fsync,
 	.vop_getattr =		nfs_getattr,
 	.vop_inactive =		ncl_inactive,
 	.vop_pathconf =		nfs_pathconf,
 	.vop_print =		nfs_print,
 	.vop_read =		nfsfifo_read,
 	.vop_reclaim =		ncl_reclaim,
 	.vop_setattr =		nfs_setattr,
 	.vop_write =		nfsfifo_write,
 };
+VFS_VOP_VECTOR_REGISTER(newnfs_fifoops_nosig);
 
 static int
 nfs_fifoops_bypass(struct vop_generic_args *a)
 {
 
 	return (vop_sigdefer(&newnfs_fifoops_nosig, a));
 }
 
 struct vop_vector newnfs_fifoops = {
 	.vop_default =		&default_vnodeops,
 	.vop_bypass =		nfs_fifoops_bypass,
 };
+VFS_VOP_VECTOR_REGISTER(newnfs_fifoops);
 
 static int nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp, struct vattr *vap);
 static int nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td);
 static int nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp,
     char *fnameptr, int fnamelen, struct vnode *tdvp, struct vnode *tvp,
     char *tnameptr, int tnamelen, struct ucred *cred, struct thread *td);
 static int nfs_renameit(struct vnode *sdvp, struct vnode *svp,
     struct componentname *scnp, struct sillyrename *sp);
 
 /*
  * Global variables
  */
 SYSCTL_DECL(_vfs_nfs);
 
 static int	nfsaccess_cache_timeout = NFS_MAXATTRTIMO;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, access_cache_timeout, CTLFLAG_RW,
 	   &nfsaccess_cache_timeout, 0, "NFS ACCESS cache timeout");
 
 static int	nfs_prime_access_cache = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, prime_access_cache, CTLFLAG_RW,
 	   &nfs_prime_access_cache, 0,
 	   "Prime NFS ACCESS cache when fetching attributes");
 
 static int	newnfs_commit_on_close = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, commit_on_close, CTLFLAG_RW,
     &newnfs_commit_on_close, 0, "write+commit on close, else only write");
 
 static int	nfs_clean_pages_on_close = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, clean_pages_on_close, CTLFLAG_RW,
 	   &nfs_clean_pages_on_close, 0, "NFS clean dirty pages on close");
 
 int newnfs_directio_enable = 0;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_enable, CTLFLAG_RW,
 	   &newnfs_directio_enable, 0, "Enable NFS directio");
 
 int nfs_keep_dirty_on_error;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_keep_dirty_on_error, CTLFLAG_RW,
     &nfs_keep_dirty_on_error, 0, "Retry pageout if error returned");
 
 /*
  * This sysctl allows other processes to mmap a file that has been opened
  * O_DIRECT by a process.  In general, having processes mmap the file while
  * Direct IO is in progress can lead to Data Inconsistencies.  But, we allow
  * this by default to prevent DoS attacks - to prevent a malicious user from
  * opening up files O_DIRECT preventing other users from mmap'ing these
  * files.  "Protected" environments where stricter consistency guarantees are
  * required can disable this knob.  The process that opened the file O_DIRECT
  * cannot mmap() the file, because mmap'ed IO on an O_DIRECT open() is not
  * meaningful.
  */
 int newnfs_directio_allow_mmap = 1;
 SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_directio_allow_mmap, CTLFLAG_RW,
 	   &newnfs_directio_allow_mmap, 0, "Enable mmaped IO on file with O_DIRECT opens");
 
 #define	NFSACCESS_ALL (NFSACCESS_READ | NFSACCESS_MODIFY		\
 			 | NFSACCESS_EXTEND | NFSACCESS_EXECUTE	\
 			 | NFSACCESS_DELETE | NFSACCESS_LOOKUP)
 
 /*
  * SMP Locking Note :
  * The list of locks after the description of the lock is the ordering
  * of other locks acquired with the lock held.
  * np->n_mtx : Protects the fields in the nfsnode.
        VM Object Lock
        VI_MTX (acquired indirectly)
  * nmp->nm_mtx : Protects the fields in the nfsmount.
        rep->r_mtx
  * ncl_iod_mutex : Global lock, protects shared nfsiod state.
  * nfs_reqq_mtx : Global lock, protects the nfs_reqq list.
        nmp->nm_mtx
        rep->r_mtx
  * rep->r_mtx : Protects the fields in an nfsreq.
  */
 
 static int
 nfs_lock(struct vop_lock1_args *ap)
 {
 	struct vnode *vp;
 	struct nfsnode *np;
 	u_quad_t nsize;
 	int error, lktype;
 	bool onfault;
 
 	vp = ap->a_vp;
 	lktype = ap->a_flags & LK_TYPE_MASK;
 	error = VOP_LOCK1_APV(&default_vnodeops, ap);
 	if (error != 0 || vp->v_op != &newnfs_vnodeops)
 		return (error);
 	np = VTONFS(vp);
 	if (np == NULL)
 		return (0);
 	NFSLOCKNODE(np);
 	if ((np->n_flag & NVNSETSZSKIP) == 0 || (lktype != LK_SHARED &&
 	    lktype != LK_EXCLUSIVE && lktype != LK_UPGRADE &&
 	    lktype != LK_TRYUPGRADE)) {
 		NFSUNLOCKNODE(np);
 		return (0);
 	}
 	onfault = (ap->a_flags & LK_EATTR_MASK) == LK_NOWAIT &&
 	    (ap->a_flags & LK_INIT_MASK) == LK_CANRECURSE &&
 	    (lktype == LK_SHARED || lktype == LK_EXCLUSIVE);
 	if (onfault && vp->v_vnlock->lk_recurse == 0) {
 		/*
 		 * Force retry in vm_fault(), to make the lock request
 		 * sleepable, which allows us to piggy-back the
 		 * sleepable call to vnode_pager_setsize().
 		 */
 		NFSUNLOCKNODE(np);
 		VOP_UNLOCK(vp, 0);
 		return (EBUSY);
 	}
 	if ((ap->a_flags & LK_NOWAIT) != 0 ||
 	    (lktype == LK_SHARED && vp->v_vnlock->lk_recurse > 0)) {
 		NFSUNLOCKNODE(np);
 		return (0);
 	}
 	if (lktype == LK_SHARED) {
 		NFSUNLOCKNODE(np);
 		VOP_UNLOCK(vp, 0);
 		ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK);
 		ap->a_flags |= LK_EXCLUSIVE;
 		error = VOP_LOCK1_APV(&default_vnodeops, ap);
 		if (error != 0 || vp->v_op != &newnfs_vnodeops)
 			return (error);
 		if (vp->v_data == NULL)
 			goto downgrade;
 		MPASS(vp->v_data == np);
 		NFSLOCKNODE(np);
 		if ((np->n_flag & NVNSETSZSKIP) == 0) {
 			NFSUNLOCKNODE(np);
 			goto downgrade;
 		}
 	}
 	np->n_flag &= ~NVNSETSZSKIP;
 	nsize = np->n_size;
 	NFSUNLOCKNODE(np);
 	vnode_pager_setsize(vp, nsize);
 downgrade:
 	if (lktype == LK_SHARED) {
 		ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK);
 		ap->a_flags |= LK_DOWNGRADE;
 		(void)VOP_LOCK1_APV(&default_vnodeops, ap);
 	}
 	return (0);
 }
 
 static int
 nfs34_access_otw(struct vnode *vp, int wmode, struct thread *td,
     struct ucred *cred, u_int32_t *retmode)
 {
 	int error = 0, attrflag, i, lrupos;
 	u_int32_t rmode;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 
 	error = nfsrpc_accessrpc(vp, wmode, cred, td, &nfsva, &attrflag,
 	    &rmode, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		lrupos = 0;
 		NFSLOCKNODE(np);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (np->n_accesscache[i].uid == cred->cr_uid) {
 				np->n_accesscache[i].mode = rmode;
 				np->n_accesscache[i].stamp = time_second;
 				break;
 			}
 			if (i > 0 && np->n_accesscache[i].stamp <
 			    np->n_accesscache[lrupos].stamp)
 				lrupos = i;
 		}
 		if (i == NFS_ACCESSCACHESIZE) {
 			np->n_accesscache[lrupos].uid = cred->cr_uid;
 			np->n_accesscache[lrupos].mode = rmode;
 			np->n_accesscache[lrupos].stamp = time_second;
 		}
 		NFSUNLOCKNODE(np);
 		if (retmode != NULL)
 			*retmode = rmode;
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, rmode, 0);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 #ifdef KDTRACE_HOOKS
 	if (error != 0)
 		KDTRACE_NFS_ACCESSCACHE_LOAD_DONE(vp, cred->cr_uid, 0,
 		    error);
 #endif
 	return (error);
 }
 
 /*
  * nfs access vnode op.
  * For nfs version 2, just return ok. File accesses may fail later.
  * For nfs version 3, use the access rpc to check accessibility. If file modes
  * are changed on the server, accesses might still fail later.
  */
 static int
 nfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int error = 0, i, gotahit;
 	u_int32_t mode, wmode, rmode;
 	int v34 = NFS_ISV34(vp);
 	struct nfsnode *np = VTONFS(vp);
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((ap->a_accmode & (VWRITE | VAPPEND | VWRITE_NAMED_ATTRS |
 	    VDELETE_CHILD | VWRITE_ATTRIBUTES | VDELETE | VWRITE_ACL |
 	    VWRITE_OWNER)) != 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	/*
 	 * For nfs v3 or v4, check to see if we have done this recently, and if
 	 * so return our cached result instead of making an ACCESS call.
 	 * If not, do an access rpc, otherwise you are stuck emulating
 	 * ufs_access() locally using the vattr. This may not be correct,
 	 * since the server may apply other access criteria such as
 	 * client uid-->server uid mapping that we do not know about.
 	 */
 	if (v34) {
 		if (ap->a_accmode & VREAD)
 			mode = NFSACCESS_READ;
 		else
 			mode = 0;
 		if (vp->v_type != VDIR) {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_EXECUTE;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 		} else {
 			if (ap->a_accmode & VWRITE)
 				mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND);
 			if (ap->a_accmode & VAPPEND)
 				mode |= NFSACCESS_EXTEND;
 			if (ap->a_accmode & VEXEC)
 				mode |= NFSACCESS_LOOKUP;
 			if (ap->a_accmode & VDELETE)
 				mode |= NFSACCESS_DELETE;
 			if (ap->a_accmode & VDELETE_CHILD)
 				mode |= NFSACCESS_MODIFY;
 		}
 		/* XXX safety belt, only make blanket request if caching */
 		if (nfsaccess_cache_timeout > 0) {
 			wmode = NFSACCESS_READ | NFSACCESS_MODIFY |
 				NFSACCESS_EXTEND | NFSACCESS_EXECUTE |
 				NFSACCESS_DELETE | NFSACCESS_LOOKUP;
 		} else {
 			wmode = mode;
 		}
 
 		/*
 		 * Does our cached result allow us to give a definite yes to
 		 * this request?
 		 */
 		gotahit = 0;
 		NFSLOCKNODE(np);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++) {
 			if (ap->a_cred->cr_uid == np->n_accesscache[i].uid) {
 			    if (time_second < (np->n_accesscache[i].stamp
 				+ nfsaccess_cache_timeout) &&
 				(np->n_accesscache[i].mode & mode) == mode) {
 				NFSINCRGLOBAL(nfsstatsv1.accesscache_hits);
 				gotahit = 1;
 			    }
 			    break;
 			}
 		}
 		NFSUNLOCKNODE(np);
 #ifdef KDTRACE_HOOKS
 		if (gotahit != 0)
 			KDTRACE_NFS_ACCESSCACHE_GET_HIT(vp,
 			    ap->a_cred->cr_uid, mode);
 		else
 			KDTRACE_NFS_ACCESSCACHE_GET_MISS(vp,
 			    ap->a_cred->cr_uid, mode);
 #endif
 		if (gotahit == 0) {
 			/*
 			 * Either a no, or a don't know.  Go to the wire.
 			 */
 			NFSINCRGLOBAL(nfsstatsv1.accesscache_misses);
 		        error = nfs34_access_otw(vp, wmode, ap->a_td,
 			    ap->a_cred, &rmode);
 			if (!error &&
 			    (rmode & mode) != mode)
 				error = EACCES;
 		}
 		return (error);
 	} else {
 		if ((error = nfsspec_access(ap)) != 0) {
 			return (error);
 		}
 		/*
 		 * Attempt to prevent a mapped root from accessing a file
 		 * which it shouldn't.  We try to read a byte from the file
 		 * if the user is root and the file is not zero length.
 		 * After calling nfsspec_access, we should have the correct
 		 * file size cached.
 		 */
 		NFSLOCKNODE(np);
 		if (ap->a_cred->cr_uid == 0 && (ap->a_accmode & VREAD)
 		    && VTONFS(vp)->n_size > 0) {
 			struct iovec aiov;
 			struct uio auio;
 			char buf[1];
 
 			NFSUNLOCKNODE(np);
 			aiov.iov_base = buf;
 			aiov.iov_len = 1;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = 0;
 			auio.uio_resid = 1;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = ap->a_td;
 
 			if (vp->v_type == VREG)
 				error = ncl_readrpc(vp, &auio, ap->a_cred);
 			else if (vp->v_type == VDIR) {
 				char* bp;
 				bp = malloc(NFS_DIRBLKSIZ, M_TEMP, M_WAITOK);
 				aiov.iov_base = bp;
 				aiov.iov_len = auio.uio_resid = NFS_DIRBLKSIZ;
 				error = ncl_readdirrpc(vp, &auio, ap->a_cred,
 				    ap->a_td);
 				free(bp, M_TEMP);
 			} else if (vp->v_type == VLNK)
 				error = ncl_readlinkrpc(vp, &auio, ap->a_cred);
 			else
 				error = EACCES;
 		} else
 			NFSUNLOCKNODE(np);
 		return (error);
 	}
 }
 
 
 /*
  * nfs open vnode op
  * Check to see if the type is ok
  * and that deletion is not in progress.
  * For paged in text files, you will need to flush the page cache
  * if consistency is lost.
  */
 /* ARGSUSED */
 static int
 nfs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	int error;
 	int fmode = ap->a_mode;
 	struct ucred *cred;
 	vm_object_t obj;
 
 	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK)
 		return (EOPNOTSUPP);
 
 	/*
 	 * For NFSv4, we need to do the Open Op before cache validation,
 	 * so that we conform to RFC3530 Sec. 9.3.1.
 	 */
 	if (NFS_ISV4(vp)) {
 		error = nfsrpc_open(vp, fmode, ap->a_cred, ap->a_td);
 		if (error) {
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 			return (error);
 		}
 	}
 
 	/*
 	 * Now, if this Open will be doing reading, re-validate/flush the
 	 * cache, so that Close/Open coherency is maintained.
 	 */
 	NFSLOCKNODE(np);
 	if (np->n_flag & NMODIFIED) {
 		NFSUNLOCKNODE(np);
 		error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		if (error == EINTR || error == EIO) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		NFSLOCKNODE(np);
 		np->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 		if (vp->v_type == VDIR)
 			np->n_direofoffset = 0;
 		NFSUNLOCKNODE(np);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		NFSLOCKNODE(np);
 		np->n_mtime = vattr.va_mtime;
 		if (NFS_ISV4(vp))
 			np->n_change = vattr.va_filerev;
 	} else {
 		NFSUNLOCKNODE(np);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error) {
 			if (NFS_ISV4(vp))
 				(void) nfsrpc_close(vp, 0, ap->a_td);
 			return (error);
 		}
 		NFSLOCKNODE(np);
 		if ((NFS_ISV4(vp) && np->n_change != vattr.va_filerev) ||
 		    NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 			if (vp->v_type == VDIR)
 				np->n_direofoffset = 0;
 			NFSUNLOCKNODE(np);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error == EINTR || error == EIO) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			NFSLOCKNODE(np);
 			np->n_mtime = vattr.va_mtime;
 			if (NFS_ISV4(vp))
 				np->n_change = vattr.va_filerev;
 		}
 	}
 
 	/*
 	 * If the object has >= 1 O_DIRECT active opens, we disable caching.
 	 */
 	if (newnfs_directio_enable && (fmode & O_DIRECT) &&
 	    (vp->v_type == VREG)) {
 		if (np->n_directio_opens == 0) {
 			NFSUNLOCKNODE(np);
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 			if (error) {
 				if (NFS_ISV4(vp))
 					(void) nfsrpc_close(vp, 0, ap->a_td);
 				return (error);
 			}
 			NFSLOCKNODE(np);
 			np->n_flag |= NNONCACHE;
 		}
 		np->n_directio_opens++;
 	}
 
 	/* If opened for writing via NFSv4.1 or later, mark that for pNFS. */
 	if (NFSHASPNFS(VFSTONFS(vp->v_mount)) && (fmode & FWRITE) != 0)
 		np->n_flag |= NWRITEOPENED;
 
 	/*
 	 * If this is an open for writing, capture a reference to the
 	 * credentials, so they can be used by ncl_putpages(). Using
 	 * these write credentials is preferable to the credentials of
 	 * whatever thread happens to be doing the VOP_PUTPAGES() since
 	 * the write RPCs are less likely to fail with EACCES.
 	 */
 	if ((fmode & FWRITE) != 0) {
 		cred = np->n_writecred;
 		np->n_writecred = crhold(ap->a_cred);
 	} else
 		cred = NULL;
 	NFSUNLOCKNODE(np);
 
 	if (cred != NULL)
 		crfree(cred);
 	vnode_create_vobject(vp, vattr.va_size, ap->a_td);
 
 	/*
 	 * If the text file has been mmap'd, flush any dirty pages to the
 	 * buffer cache and then...
 	 * Make sure all writes are pushed to the NFS server.  If this is not
 	 * done, the modify time of the file can change while the text
 	 * file is being executed.  This will cause the process that is
 	 * executing the text file to be terminated.
 	 */
 	if (vp->v_writecount <= -1) {
 		if ((obj = vp->v_object) != NULL &&
 		    vm_object_mightbedirty(obj)) {
 			VM_OBJECT_WLOCK(obj);
 			vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
 			VM_OBJECT_WUNLOCK(obj);
 		}
 
 		/* Now, flush the buffer cache. */
 		ncl_flush(vp, MNT_WAIT, curthread, 0, 0);
 
 		/* And, finally, make sure that n_mtime is up to date. */
 		np = VTONFS(vp);
 		NFSLOCKNODE(np);
 		np->n_mtime = np->n_vattr.na_mtime;
 		NFSUNLOCKNODE(np);
 	}
 	return (0);
 }
 
 /*
  * nfs close vnode op
  * What an NFS client should do upon close after writing is a debatable issue.
  * Most NFS clients push delayed writes to the server upon close, basically for
  * two reasons:
  * 1 - So that any write errors may be reported back to the client process
  *     doing the close system call. By far the two most likely errors are
  *     NFSERR_NOSPC and NFSERR_DQUOT to indicate space allocation failure.
  * 2 - To put a worst case upper bound on cache inconsistency between
  *     multiple clients for the file.
  * There is also a consistency problem for Version 2 of the protocol w.r.t.
  * not being able to tell if other clients are writing a file concurrently,
  * since there is no way of knowing if the changed modify time in the reply
  * is only due to the write for this client.
  * (NFS Version 3 provides weak cache consistency data in the reply that
  *  should be sufficient to detect and handle this case.)
  *
  * The current code does the following:
  * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers
  * for NFS Version 3 - flush dirty buffers to the server but don't invalidate
  *                     or commit them (this satisfies 1 and 2 except for the
  *                     case where the server crashes after this close but
  *                     before the commit RPC, which is felt to be "good
  *                     enough". Changing the last argument to ncl_flush() to
  *                     a 1 would force a commit operation, if it is felt a
  *                     commit is necessary now.
  * for NFS Version 4 - flush the dirty buffers and commit them, if
  *		       nfscl_mustflush() says this is necessary.
  *                     It is necessary if there is no write delegation held,
  *                     in order to satisfy open/close coherency.
  *                     If the file isn't cached on local stable storage,
  *                     it may be necessary in order to detect "out of space"
  *                     errors from the server, if the write delegation
  *                     issued by the server doesn't allow the file to grow.
  */
 /* ARGSUSED */
 static int
 nfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct nfsvattr nfsva;
 	struct ucred *cred;
 	int error = 0, ret, localcred = 0;
 	int fmode = ap->a_fflag;
 
 	if (NFSCL_FORCEDISM(vp->v_mount))
 		return (0);
 	/*
 	 * During shutdown, a_cred isn't valid, so just use root.
 	 */
 	if (ap->a_cred == NOCRED) {
 		cred = newnfs_getcred();
 		localcred = 1;
 	} else {
 		cred = ap->a_cred;
 	}
 	if (vp->v_type == VREG) {
 	    /*
 	     * Examine and clean dirty pages, regardless of NMODIFIED.
 	     * This closes a major hole in close-to-open consistency.
 	     * We want to push out all dirty pages (and buffers) on
 	     * close, regardless of whether they were dirtied by
 	     * mmap'ed writes or via write().
 	     */
 	    if (nfs_clean_pages_on_close && vp->v_object) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	    }
 	    NFSLOCKNODE(np);
 	    if (np->n_flag & NMODIFIED) {
 		NFSUNLOCKNODE(np);
 		if (NFS_ISV3(vp)) {
 		    /*
 		     * Under NFSv3 we have dirty buffers to dispose of.  We
 		     * must flush them to the NFS server.  We have the option
 		     * of waiting all the way through the commit rpc or just
 		     * waiting for the initial write.  The default is to only
 		     * wait through the initial write so the data is in the
 		     * server's cache, which is roughly similar to the state
 		     * a standard disk subsystem leaves the file in on close().
 		     *
 		     * We cannot clear the NMODIFIED bit in np->n_flag due to
 		     * potential races with other processes, and certainly
 		     * cannot clear it if we don't commit.
 		     * These races occur when there is no longer the old
 		     * traditional vnode locking implemented for Vnode Ops.
 		     */
 		    int cm = newnfs_commit_on_close ? 1 : 0;
 		    error = ncl_flush(vp, MNT_WAIT, ap->a_td, cm, 0);
 		    /* np->n_flag &= ~NMODIFIED; */
 		} else if (NFS_ISV4(vp)) { 
 			if (nfscl_mustflush(vp) != 0) {
 				int cm = newnfs_commit_on_close ? 1 : 0;
 				error = ncl_flush(vp, MNT_WAIT, ap->a_td,
 				    cm, 0);
 				/*
 				 * as above w.r.t races when clearing
 				 * NMODIFIED.
 				 * np->n_flag &= ~NMODIFIED;
 				 */
 			}
 		} else {
 			error = ncl_vinvalbuf(vp, V_SAVE, ap->a_td, 1);
 		}
 		NFSLOCKNODE(np);
 	    }
  	    /* 
  	     * Invalidate the attribute cache in all cases.
  	     * An open is going to fetch fresh attrs any way, other procs
  	     * on this node that have file open will be forced to do an 
  	     * otw attr fetch, but this is safe.
 	     * --> A user found that their RPC count dropped by 20% when
 	     *     this was commented out and I can't see any requirement
 	     *     for it, so I've disabled it when negative lookups are
 	     *     enabled. (What does this have to do with negative lookup
 	     *     caching? Well nothing, except it was reported by the
 	     *     same user that needed negative lookup caching and I wanted
 	     *     there to be a way to disable it to see if it
 	     *     is the cause of some caching/coherency issue that might
 	     *     crop up.)
  	     */
 	    if (VFSTONFS(vp->v_mount)->nm_negnametimeo == 0) {
 		    np->n_attrstamp = 0;
 		    KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	    }
 	    if (np->n_flag & NWRITEERR) {
 		np->n_flag &= ~NWRITEERR;
 		error = np->n_error;
 	    }
 	    NFSUNLOCKNODE(np);
 	}
 
 	if (NFS_ISV4(vp)) {
 		/*
 		 * Get attributes so "change" is up to date.
 		 */
 		if (error == 0 && nfscl_mustflush(vp) != 0 &&
 		    vp->v_type == VREG &&
 		    (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOCTO) == 0) {
 			ret = nfsrpc_getattr(vp, cred, ap->a_td, &nfsva,
 			    NULL);
 			if (!ret) {
 				np->n_change = nfsva.na_filerev;
 				(void) nfscl_loadattrcache(&vp, &nfsva, NULL,
 				    NULL, 0, 0);
 			}
 		}
 
 		/*
 		 * and do the close.
 		 */
 		ret = nfsrpc_close(vp, 0, ap->a_td);
 		if (!error && ret)
 			error = ret;
 		if (error)
 			error = nfscl_maperr(ap->a_td, error, (uid_t)0,
 			    (gid_t)0);
 	}
 	if (newnfs_directio_enable)
 		KASSERT((np->n_directio_asyncwr == 0),
 			("nfs_close: dirty unflushed (%d) directio buffers\n",
 			 np->n_directio_asyncwr));
 	if (newnfs_directio_enable && (fmode & O_DIRECT) && (vp->v_type == VREG)) {
 		NFSLOCKNODE(np);
 		KASSERT((np->n_directio_opens > 0), 
 			("nfs_close: unexpectedly value (0) of n_directio_opens\n"));
 		np->n_directio_opens--;
 		if (np->n_directio_opens == 0)
 			np->n_flag &= ~NNONCACHE;
 		NFSUNLOCKNODE(np);
 	}
 	if (localcred)
 		NFSFREECRED(cred);
 	return (error);
 }
 
 /*
  * nfs getattr call from vfs.
  */
 static int
 nfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;	/* XXX */
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct nfsvattr nfsva;
 	struct vattr *vap = ap->a_vap;
 	struct vattr vattr;
 
 	/*
 	 * Update local times for special files.
 	 */
 	NFSLOCKNODE(np);
 	if (np->n_flag & (NACC | NUPD))
 		np->n_flag |= NCHG;
 	NFSUNLOCKNODE(np);
 	/*
 	 * First look in the cache.
 	 */
 	if (ncl_getattrcache(vp, &vattr) == 0) {
 		vap->va_type = vattr.va_type;
 		vap->va_mode = vattr.va_mode;
 		vap->va_nlink = vattr.va_nlink;
 		vap->va_uid = vattr.va_uid;
 		vap->va_gid = vattr.va_gid;
 		vap->va_fsid = vattr.va_fsid;
 		vap->va_fileid = vattr.va_fileid;
 		vap->va_size = vattr.va_size;
 		vap->va_blocksize = vattr.va_blocksize;
 		vap->va_atime = vattr.va_atime;
 		vap->va_mtime = vattr.va_mtime;
 		vap->va_ctime = vattr.va_ctime;
 		vap->va_gen = vattr.va_gen;
 		vap->va_flags = vattr.va_flags;
 		vap->va_rdev = vattr.va_rdev;
 		vap->va_bytes = vattr.va_bytes;
 		vap->va_filerev = vattr.va_filerev;
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 		return (0);
 	}
 
 	if (NFS_ISV34(vp) && nfs_prime_access_cache &&
 	    nfsaccess_cache_timeout > 0) {
 		NFSINCRGLOBAL(nfsstatsv1.accesscache_misses);
 		nfs34_access_otw(vp, NFSACCESS_ALL, td, ap->a_cred, NULL);
 		if (ncl_getattrcache(vp, ap->a_vap) == 0) {
 			nfscl_deleggetmodtime(vp, &ap->a_vap->va_mtime);
 			return (0);
 		}
 	}
 	error = nfsrpc_getattr(vp, ap->a_cred, td, &nfsva, NULL);
 	if (!error)
 		error = nfscl_loadattrcache(&vp, &nfsva, vap, NULL, 0, 0);
 	if (!error) {
 		/*
 		 * Get the local modify time for the case of a write
 		 * delegation.
 		 */
 		nfscl_deleggetmodtime(vp, &vap->va_mtime);
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * nfs setattr call.
  */
 static int
 nfs_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct thread *td = curthread;	/* XXX */
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 	u_quad_t tsize;
 
 #ifndef nolint
 	tsize = (u_quad_t)0;
 #endif
 
 	/*
 	 * Setting of flags and marking of atimes are not supported.
 	 */
 	if (vap->va_flags != VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			if (vap->va_mtime.tv_sec == VNOVAL &&
 			    vap->va_atime.tv_sec == VNOVAL &&
 			    vap->va_mode == (mode_t)VNOVAL &&
 			    vap->va_uid == (uid_t)VNOVAL &&
 			    vap->va_gid == (gid_t)VNOVAL)
 				return (0);		
  			vap->va_size = VNOVAL;
  			break;
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			/*
 			 *  We run vnode_pager_setsize() early (why?),
 			 * we must set np->n_size now to avoid vinvalbuf
 			 * V_SAVE races that might setsize a lower
 			 * value.
 			 */
 			NFSLOCKNODE(np);
 			tsize = np->n_size;
 			NFSUNLOCKNODE(np);
 			error = ncl_meta_setsize(vp, td, vap->va_size);
 			NFSLOCKNODE(np);
  			if (np->n_flag & NMODIFIED) {
 			    tsize = np->n_size;
 			    NFSUNLOCKNODE(np);
 			    error = ncl_vinvalbuf(vp, vap->va_size == 0 ?
 			        0 : V_SAVE, td, 1);
 			    if (error != 0) {
 				    vnode_pager_setsize(vp, tsize);
 				    return (error);
 			    }
 			    /*
 			     * Call nfscl_delegmodtime() to set the modify time
 			     * locally, as required.
 			     */
 			    nfscl_delegmodtime(vp);
  			} else
 			    NFSUNLOCKNODE(np);
 			/*
 			 * np->n_size has already been set to vap->va_size
 			 * in ncl_meta_setsize(). We must set it again since
 			 * nfs_loadattrcache() could be called through
 			 * ncl_meta_setsize() and could modify np->n_size.
 			 */
 			NFSLOCKNODE(np);
  			np->n_vattr.na_size = np->n_size = vap->va_size;
 			NFSUNLOCKNODE(np);
   		}
   	} else {
 		NFSLOCKNODE(np);
 		if ((vap->va_mtime.tv_sec != VNOVAL || vap->va_atime.tv_sec != VNOVAL) && 
 		    (np->n_flag & NMODIFIED) && vp->v_type == VREG) {
 			NFSUNLOCKNODE(np);
 			error = ncl_vinvalbuf(vp, V_SAVE, td, 1);
 			if (error == EINTR || error == EIO)
 				return (error);
 		} else
 			NFSUNLOCKNODE(np);
 	}
 	error = nfs_setattrrpc(vp, vap, ap->a_cred, td);
 	if (error && vap->va_size != VNOVAL) {
 		NFSLOCKNODE(np);
 		np->n_size = np->n_vattr.na_size = tsize;
 		vnode_pager_setsize(vp, tsize);
 		NFSUNLOCKNODE(np);
 	}
 	return (error);
 }
 
 /*
  * Do an nfs setattr rpc.
  */
 static int
 nfs_setattrrpc(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsnode *np = VTONFS(vp);
 	int error, ret, attrflag, i;
 	struct nfsvattr nfsva;
 
 	if (NFS_ISV34(vp)) {
 		NFSLOCKNODE(np);
 		for (i = 0; i < NFS_ACCESSCACHESIZE; i++)
 			np->n_accesscache[i].stamp = 0;
 		np->n_flag |= NDELEGMOD;
 		NFSUNLOCKNODE(np);
 		KDTRACE_NFS_ACCESSCACHE_FLUSH_DONE(vp);
 	}
 	error = nfsrpc_setattr(vp, vap, NULL, cred, td, &nfsva, &attrflag,
 	    NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, vap->va_uid, vap->va_gid);
 	return (error);
 }
 
 /*
  * nfs lookup call, one step at a time...
  * First look in cache
  * If not found, unlock the directory nfsnode and do the rpc
  */
 static int
 nfs_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct mount *mp = dvp->v_mount;
 	int flags = cnp->cn_flags;
 	struct vnode *newvp;
 	struct nfsmount *nmp;
 	struct nfsnode *np, *newnp;
 	int error = 0, attrflag, dattrflag, ltype, ncticks;
 	struct thread *td = cnp->cn_thread;
 	struct nfsfh *nfhp;
 	struct nfsvattr dnfsva, nfsva;
 	struct vattr vattr;
 	struct timespec nctime;
 	
 	*vpp = NULLVP;
 	if ((flags & ISLASTCN) && (mp->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 	nmp = VFSTONFS(mp);
 	np = VTONFS(dvp);
 
 	/* For NFSv4, wait until any remove is done. */
 	NFSLOCKNODE(np);
 	while (NFSHASNFSV4(nmp) && (np->n_flag & NREMOVEINPROG)) {
 		np->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)np, &np->n_mtx, PZERO, "nfslkup", 0);
 	}
 	NFSUNLOCKNODE(np);
 
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
 		return (error);
 	error = cache_lookup(dvp, vpp, cnp, &nctime, &ncticks);
 	if (error > 0 && error != ENOENT)
 		return (error);
 	if (error == -1) {
 		/*
 		 * Lookups of "." are special and always return the
 		 * current directory.  cache_lookup() already handles
 		 * associated locking bookkeeping, etc.
 		 */
 		if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 			/* XXX: Is this really correct? */
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 
 		/*
 		 * We only accept a positive hit in the cache if the
 		 * change time of the file matches our cached copy.
 		 * Otherwise, we discard the cache entry and fallback
 		 * to doing a lookup RPC.  We also only trust cache
 		 * entries for less than nm_nametimeo seconds.
 		 *
 		 * To better handle stale file handles and attributes,
 		 * clear the attribute cache of this node if it is a
 		 * leaf component, part of an open() call, and not
 		 * locally modified before fetching the attributes.
 		 * This should allow stale file handles to be detected
 		 * here where we can fall back to a LOOKUP RPC to
 		 * recover rather than having nfs_open() detect the
 		 * stale file handle and failing open(2) with ESTALE.
 		 */
 		newvp = *vpp;
 		newnp = VTONFS(newvp);
 		if (!(nmp->nm_flag & NFSMNT_NOCTO) &&
 		    (flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(newnp->n_flag & NMODIFIED)) {
 			NFSLOCKNODE(newnp);
 			newnp->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			NFSUNLOCKNODE(newnp);
 		}
 		if (nfscl_nodeleg(newvp, 0) == 0 ||
 		    ((u_int)(ticks - ncticks) < (nmp->nm_nametimeo * hz) &&
 		    VOP_GETATTR(newvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_ctime, &nctime, ==))) {
 			NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits);
 			if (cnp->cn_nameiop != LOOKUP &&
 			    (flags & ISLASTCN))
 				cnp->cn_flags |= SAVENAME;
 			return (0);
 		}
 		cache_purge(newvp);
 		if (dvp != newvp)
 			vput(newvp);
 		else 
 			vrele(newvp);
 		*vpp = NULLVP;
 	} else if (error == ENOENT) {
 		if (VN_IS_DOOMED(dvp))
 			return (ENOENT);
 		/*
 		 * We only accept a negative hit in the cache if the
 		 * modification time of the parent directory matches
 		 * the cached copy in the name cache entry.
 		 * Otherwise, we discard all of the negative cache
 		 * entries for this directory.  We also only trust
 		 * negative cache entries for up to nm_negnametimeo
 		 * seconds.
 		 */
 		if ((u_int)(ticks - ncticks) < (nmp->nm_negnametimeo * hz) &&
 		    VOP_GETATTR(dvp, &vattr, cnp->cn_cred) == 0 &&
 		    timespeccmp(&vattr.va_mtime, &nctime, ==)) {
 			NFSINCRGLOBAL(nfsstatsv1.lookupcache_hits);
 			return (ENOENT);
 		}
 		cache_purge_negative(dvp);
 	}
 
 	newvp = NULLVP;
 	NFSINCRGLOBAL(nfsstatsv1.lookupcache_misses);
 	error = nfsrpc_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, td, &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 	    NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (error) {
 		if (newvp != NULLVP) {
 			vput(newvp);
 			*vpp = NULLVP;
 		}
 
 		if (error != ENOENT) {
 			if (NFS_ISV4(dvp))
 				error = nfscl_maperr(td, error, (uid_t)0,
 				    (gid_t)0);
 			return (error);
 		}
 
 		/* The requested file was not found. */
 		if ((cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME) &&
 		    (flags & ISLASTCN)) {
 			/*
 			 * XXX: UFS does a full VOP_ACCESS(dvp,
 			 * VWRITE) here instead of just checking
 			 * MNT_RDONLY.
 			 */
 			if (mp->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 
 		if ((cnp->cn_flags & MAKEENTRY) != 0 && dattrflag) {
 			/*
 			 * Cache the modification time of the parent
 			 * directory from the post-op attributes in
 			 * the name cache entry.  The negative cache
 			 * entry will be ignored once the directory
 			 * has changed.  Don't bother adding the entry
 			 * if the directory has already changed.
 			 */
 			NFSLOCKNODE(np);
 			if (timespeccmp(&np->n_vattr.na_mtime,
 			    &dnfsva.na_mtime, ==)) {
 				NFSUNLOCKNODE(np);
 				cache_enter_time(dvp, NULL, cnp,
 				    &dnfsva.na_mtime, NULL);
 			} else
 				NFSUNLOCKNODE(np);
 		}
 		return (ENOENT);
 	}
 
 	/*
 	 * Handle RENAME case...
 	 */
 	if (cnp->cn_nameiop == RENAME && (flags & ISLASTCN)) {
 		if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 			free(nfhp, M_NFSFH);
 			return (EISDIR);
 		}
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    LK_EXCLUSIVE);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		*vpp = newvp;
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		ltype = NFSVOPISLOCKED(dvp);
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			NFSVOPUNLOCK(dvp, 0);
 			error = vfs_busy(mp, 0);
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 			vfs_rel(mp);
 			if (error == 0 && VN_IS_DOOMED(dvp)) {
 				vfs_unbusy(mp);
 				error = ENOENT;
 			}
 			if (error != 0)
 				return (error);
 		}
 		NFSVOPUNLOCK(dvp, 0);
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error == 0)
 			newvp = NFSTOV(np);
 		vfs_unbusy(mp);
 		if (newvp != dvp)
 			NFSVOPLOCK(dvp, ltype | LK_RETRY);
 		if (VN_IS_DOOMED(dvp)) {
 			if (error == 0) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 			error = ENOENT;
 		}
 		if (error != 0)
 			return (error);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (NFS_CMPFH(np, nfhp->nfh_fh, nfhp->nfh_len)) {
 		free(nfhp, M_NFSFH);
 		VREF(dvp);
 		newvp = dvp;
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else {
 		error = nfscl_nget(mp, dvp, nfhp, cnp, td, &np, NULL,
 		    cnp->cn_lkflags);
 		if (error)
 			return (error);
 		newvp = NFSTOV(np);
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 		else if ((flags & (ISLASTCN | ISOPEN)) == (ISLASTCN | ISOPEN) &&
 		    !(np->n_flag & NMODIFIED)) {			
 			/*
 			 * Flush the attribute cache when opening a
 			 * leaf node to ensure that fresh attributes
 			 * are fetched in nfs_open() since we did not
 			 * fetch attributes from the LOOKUP reply.
 			 */
 			NFSLOCKNODE(np);
 			np->n_attrstamp = 0;
 			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(newvp);
 			NFSUNLOCKNODE(np);
 		}
 	}
 	if (cnp->cn_nameiop != LOOKUP && (flags & ISLASTCN))
 		cnp->cn_flags |= SAVENAME;
 	if ((cnp->cn_flags & MAKEENTRY) &&
 	    (cnp->cn_nameiop != DELETE || !(flags & ISLASTCN)) &&
 	    attrflag != 0 && (newvp->v_type != VDIR || dattrflag != 0))
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 		    newvp->v_type != VDIR ? NULL : &dnfsva.na_ctime);
 	*vpp = newvp;
 	return (0);
 }
 
 /*
  * nfs read call.
  * Just call ncl_bioread() to do the work.
  */
 static int
 nfs_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	switch (vp->v_type) {
 	case VREG:
 		return (ncl_bioread(vp, ap->a_uio, ap->a_ioflag, ap->a_cred));
 	case VDIR:
 		return (EISDIR);
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * nfs readlink call
  */
 static int
 nfs_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (vp->v_type != VLNK)
 		return (EINVAL);
 	return (ncl_bioread(vp, ap->a_uio, 0, ap->a_cred));
 }
 
 /*
  * Do a readlink rpc.
  * Called by ncl_doio() from below the buffer cache.
  */
 int
 ncl_readlinkrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 
 	error = nfsrpc_readlink(vp, uiop, cred, uiop->uio_td, &nfsva,
 	    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs read rpc call
  * Ditto above
  */
 int
 ncl_readrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred)
 {
 	int error, ret, attrflag;
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, NULL, NULL,
 		    NFSV4OPEN_ACCESSREAD, 0, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "readrpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_read(vp, uiop, cred, uiop->uio_td, &nfsva,
 		    &attrflag, NULL);
 	if (attrflag) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs write call
  */
 int
 ncl_writerpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     int *iomode, int *must_commit, int called_from_strategy)
 {
 	struct nfsvattr nfsva;
 	int error, attrflag, ret;
 	struct nfsmount *nmp;
 
 	nmp = VFSTONFS(vnode_mount(vp));
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp))
 		error = nfscl_doiods(vp, uiop, iomode, must_commit,
 		    NFSV4OPEN_ACCESSWRITE, 0, cred, uiop->uio_td);
 	NFSCL_DEBUG(4, "writerpc: aft doiods=%d\n", error);
 	if (error != 0)
 		error = nfsrpc_write(vp, uiop, iomode, must_commit, cred,
 		    uiop->uio_td, &nfsva, &attrflag, NULL,
 		    called_from_strategy);
 	if (attrflag) {
 		if (VTONFS(vp)->n_flag & ND_NFSV4)
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 1,
 			    1);
 		else
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (ret && !error)
 			error = ret;
 	}
 	if (DOINGASYNC(vp))
 		*iomode = NFSWRITE_FILESYNC;
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(uiop->uio_td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs mknod rpc
  * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the
  * mode set to specify the file type and the size field for rdev.
  */
 static int
 nfs_mknodrpc(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct vattr *vap)
 {
 	struct nfsvattr nfsva, dnfsva;
 	struct vnode *newvp = NULL;
 	struct nfsnode *np = NULL, *dnp;
 	struct nfsfh *nfhp;
 	struct vattr vattr;
 	int error = 0, attrflag, dattrflag;
 	u_int32_t rdev;
 
 	if (vap->va_type == VCHR || vap->va_type == VBLK)
 		rdev = vap->va_rdev;
 	else if (vap->va_type == VFIFO || vap->va_type == VSOCK)
 		rdev = 0xffffffff;
 	else
 		return (EOPNOTSUPP);
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	error = nfsrpc_mknod(dvp, cnp->cn_nameptr, cnp->cn_namelen, vap,
 	    rdev, vap->va_type, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		if (!nfhp)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
 		if (attrflag != 0) {
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	if (!error) {
 		*vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	dnp = VTONFS(dvp);
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	NFSUNLOCKNODE(dnp);
 	return (error);
 }
 
 /*
  * nfs mknod vop
  * just call nfs_mknodrpc() to do the work.
  */
 /* ARGSUSED */
 static int
 nfs_mknod(struct vop_mknod_args *ap)
 {
 	return (nfs_mknodrpc(ap->a_dvp, ap->a_vpp, ap->a_cnp, ap->a_vap));
 }
 
 static struct mtx nfs_cverf_mtx;
 MTX_SYSINIT(nfs_cverf_mtx, &nfs_cverf_mtx, "NFS create verifier mutex",
     MTX_DEF);
 
 static nfsquad_t
 nfs_get_cverf(void)
 {
 	static nfsquad_t cverf;
 	nfsquad_t ret;
 	static int cverf_initialized = 0;
 
 	mtx_lock(&nfs_cverf_mtx);
 	if (cverf_initialized == 0) {
 		cverf.lval[0] = arc4random();
 		cverf.lval[1] = arc4random();
 		cverf_initialized = 1;
 	} else
 		cverf.qval++;
 	ret = cverf;
 	mtx_unlock(&nfs_cverf_mtx);
 
 	return (ret);
 }
 
 /*
  * nfs file create call
  */
 static int
 nfs_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct nfsmount *nmp;
 	struct nfsvattr dnfsva, nfsva;
 	struct nfsfh *nfhp;
 	nfsquad_t cverf;
 	int error = 0, attrflag, dattrflag, fmode = 0;
 	struct vattr vattr;
 
 	/*
 	 * Oops, not for me..
 	 */
 	if (vap->va_type == VSOCK)
 		return (nfs_mknodrpc(dvp, ap->a_vpp, cnp, vap));
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return (error);
 	if (vap->va_vaflags & VA_EXCLUSIVE)
 		fmode |= O_EXCL;
 	dnp = VTONFS(dvp);
 	nmp = VFSTONFS(vnode_mount(dvp));
 again:
 	/* For NFSv4, wait until any remove is done. */
 	NFSLOCKNODE(dnp);
 	while (NFSHASNFSV4(nmp) && (dnp->n_flag & NREMOVEINPROG)) {
 		dnp->n_flag |= NREMOVEWANT;
 		(void) msleep((caddr_t)dnp, &dnp->n_mtx, PZERO, "nfscrt", 0);
 	}
 	NFSUNLOCKNODE(dnp);
 
 	cverf = nfs_get_cverf();
 	error = nfsrpc_create(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cverf, fmode, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (!error) {
 		if (nfhp == NULL)
 			(void) nfsrpc_lookup(dvp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread,
 			    &dnfsva, &nfsva, &nfhp, &attrflag, &dattrflag,
 			    NULL);
 		if (nfhp != NULL)
 			error = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp,
 			    cnp->cn_thread, &np, NULL, LK_EXCLUSIVE);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (!error) {
 		newvp = NFSTOV(np);
 		if (attrflag == 0)
 			error = nfsrpc_getattr(newvp, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, NULL);
 		if (error == 0)
 			error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	if (error) {
 		if (newvp != NULL) {
 			vput(newvp);
 			newvp = NULL;
 		}
 		if (NFS_ISV34(dvp) && (fmode & O_EXCL) &&
 		    error == NFSERR_NOTSUPP) {
 			fmode &= ~O_EXCL;
 			goto again;
 		}
 	} else if (NFS_ISV34(dvp) && (fmode & O_EXCL)) {
 		if (nfscl_checksattr(vap, &nfsva)) {
 			error = nfsrpc_setattr(newvp, vap, NULL, cnp->cn_cred,
 			    cnp->cn_thread, &nfsva, &attrflag, NULL);
 			if (error && (vap->va_uid != (uid_t)VNOVAL ||
 			    vap->va_gid != (gid_t)VNOVAL)) {
 				/* try again without setting uid/gid */
 				vap->va_uid = (uid_t)VNOVAL;
 				vap->va_gid = (uid_t)VNOVAL;
 				error = nfsrpc_setattr(newvp, vap, NULL, 
 				    cnp->cn_cred, cnp->cn_thread, &nfsva,
 				    &attrflag, NULL);
 			}
 			if (attrflag)
 				(void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				    NULL, 0, 1);
 			if (error != 0)
 				vput(newvp);
 		}
 	}
 	if (!error) {
 		if ((cnp->cn_flags & MAKEENTRY) && attrflag)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    NULL);
 		*ap->a_vpp = newvp;
 	} else if (NFS_ISV4(dvp)) {
 		error = nfscl_maperr(cnp->cn_thread, error, vap->va_uid,
 		    vap->va_gid);
 	}
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	NFSUNLOCKNODE(dnp);
 	return (error);
 }
 
 /*
  * nfs file remove call
  * To try and make nfs semantics closer to ufs semantics, a file that has
  * other processes using the vnode is renamed instead of removed and then
  * removed later on the last close.
  * - If v_usecount > 1
  *	  If a rename is not already in the works
  *	     call nfs_sillyrename() to set it up
  *     else
  *	  do the remove rpc
  */
 static int
 nfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = VTONFS(vp);
 	int error = 0;
 	struct vattr vattr;
 
 	KASSERT((cnp->cn_flags & HASBUF) != 0, ("nfs_remove: no name"));
 	KASSERT(vrefcnt(vp) > 0, ("nfs_remove: bad v_usecount"));
 	if (vp->v_type == VDIR)
 		error = EPERM;
 	else if (vrefcnt(vp) == 1 || (np->n_sillyrename &&
 	    VOP_GETATTR(vp, &vattr, cnp->cn_cred) == 0 &&
 	    vattr.va_nlink > 1)) {
 		/*
 		 * Purge the name cache so that the chance of a lookup for
 		 * the name succeeding while the remove is in progress is
 		 * minimized. Without node locking it can still happen, such
 		 * that an I/O op returns ESTALE, but since you get this if
 		 * another host removes the file..
 		 */
 		cache_purge(vp);
 		/*
 		 * throw away biocache buffers, mainly to avoid
 		 * unnecessary delayed writes later.
 		 */
 		error = ncl_vinvalbuf(vp, 0, cnp->cn_thread, 1);
 		if (error != EINTR && error != EIO)
 			/* Do the rpc */
 			error = nfs_removerpc(dvp, vp, cnp->cn_nameptr,
 			    cnp->cn_namelen, cnp->cn_cred, cnp->cn_thread);
 		/*
 		 * Kludge City: If the first reply to the remove rpc is lost..
 		 *   the reply to the retransmitted request will be ENOENT
 		 *   since the file was in fact removed
 		 *   Therefore, we cheat and return success.
 		 */
 		if (error == ENOENT)
 			error = 0;
 	} else if (!np->n_sillyrename)
 		error = nfs_sillyrename(dvp, vp, cnp);
 	NFSLOCKNODE(np);
 	np->n_attrstamp = 0;
 	NFSUNLOCKNODE(np);
 	KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	return (error);
 }
 
 /*
  * nfs file remove rpc called from nfs_inactive
  */
 int
 ncl_removeit(struct sillyrename *sp, struct vnode *vp)
 {
 	/*
 	 * Make sure that the directory vnode is still valid.
 	 * XXX we should lock sp->s_dvp here.
 	 */
 	if (sp->s_dvp->v_type == VBAD)
 		return (0);
 	return (nfs_removerpc(sp->s_dvp, vp, sp->s_name, sp->s_namlen,
 	    sp->s_cred, NULL));
 }
 
 /*
  * Nfs remove rpc, called from nfs_remove() and ncl_removeit().
  */
 static int
 nfs_removerpc(struct vnode *dvp, struct vnode *vp, char *name,
     int namelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr dnfsva;
 	struct nfsnode *dnp = VTONFS(dvp);
 	int error = 0, dattrflag;
 
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NREMOVEINPROG;
 	NFSUNLOCKNODE(dnp);
 	error = nfsrpc_remove(dvp, name, namelen, vp, cred, td, &dnfsva,
 	    &dattrflag, NULL);
 	NFSLOCKNODE(dnp);
 	if ((dnp->n_flag & NREMOVEWANT)) {
 		dnp->n_flag &= ~(NREMOVEWANT | NREMOVEINPROG);
 		NFSUNLOCKNODE(dnp);
 		wakeup((caddr_t)dnp);
 	} else {
 		dnp->n_flag &= ~NREMOVEINPROG;
 		NFSUNLOCKNODE(dnp);
 	}
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (!dattrflag) {
 		dnp->n_attrstamp = 0;
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	NFSUNLOCKNODE(dnp);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs file rename call
  */
 static int
 nfs_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct nfsnode *fnp = VTONFS(ap->a_fvp);
 	struct nfsnode *tdnp = VTONFS(ap->a_tdvp);
 	struct nfsv4node *newv4 = NULL;
 	int error;
 
 	KASSERT((tcnp->cn_flags & HASBUF) != 0 &&
 	    (fcnp->cn_flags & HASBUF) != 0, ("nfs_rename: no name"));
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (fvp == tvp) {
 		printf("nfs_rename: fvp == tvp (can't happen)\n");
 		error = 0;
 		goto out;
 	}
 	if ((error = NFSVOPLOCK(fvp, LK_EXCLUSIVE)) != 0)
 		goto out;
 
 	/*
 	 * We have to flush B_DELWRI data prior to renaming
 	 * the file.  If we don't, the delayed-write buffers
 	 * can be flushed out later after the file has gone stale
 	 * under NFSV3.  NFSV2 does not have this problem because
 	 * ( as far as I can tell ) it flushes dirty buffers more
 	 * often.
 	 * 
 	 * Skip the rename operation if the fsync fails, this can happen
 	 * due to the server's volume being full, when we pushed out data
 	 * that was written back to our cache earlier. Not checking for
 	 * this condition can result in potential (silent) data loss.
 	 */
 	error = VOP_FSYNC(fvp, MNT_WAIT, fcnp->cn_thread);
 	NFSVOPUNLOCK(fvp, 0);
 	if (!error && tvp)
 		error = VOP_FSYNC(tvp, MNT_WAIT, tcnp->cn_thread);
 	if (error)
 		goto out;
 
 	/*
 	 * If the tvp exists and is in use, sillyrename it before doing the
 	 * rename of the new file over it.
 	 * XXX Can't sillyrename a directory.
 	 */
 	if (tvp && vrefcnt(tvp) > 1 && !VTONFS(tvp)->n_sillyrename &&
 		tvp->v_type != VDIR && !nfs_sillyrename(tdvp, tvp, tcnp)) {
 		vput(tvp);
 		tvp = NULL;
 	}
 
 	error = nfs_renamerpc(fdvp, fvp, fcnp->cn_nameptr, fcnp->cn_namelen,
 	    tdvp, tvp, tcnp->cn_nameptr, tcnp->cn_namelen, tcnp->cn_cred,
 	    tcnp->cn_thread);
 
 	if (error == 0 && NFS_ISV4(tdvp)) {
 		/*
 		 * For NFSv4, check to see if it is the same name and
 		 * replace the name, if it is different.
 		 */
 		newv4 = malloc(
 		    sizeof (struct nfsv4node) +
 		    tdnp->n_fhp->nfh_len + tcnp->cn_namelen - 1,
 		    M_NFSV4NODE, M_WAITOK);
 		NFSLOCKNODE(tdnp);
 		NFSLOCKNODE(fnp);
 		if (fnp->n_v4 != NULL && fvp->v_type == VREG &&
 		    (fnp->n_v4->n4_namelen != tcnp->cn_namelen ||
 		      NFSBCMP(tcnp->cn_nameptr, NFS4NODENAME(fnp->n_v4),
 		      tcnp->cn_namelen) ||
 		      tdnp->n_fhp->nfh_len != fnp->n_v4->n4_fhlen ||
 		      NFSBCMP(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			tdnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (tcnp->cn_namelen < 100) ? tcnp->cn_namelen : 99;
 bcopy(tcnp->cn_nameptr, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("ren replace=%s\n",nnn);
 }
 #endif
 			free(fnp->n_v4, M_NFSV4NODE);
 			fnp->n_v4 = newv4;
 			newv4 = NULL;
 			fnp->n_v4->n4_fhlen = tdnp->n_fhp->nfh_len;
 			fnp->n_v4->n4_namelen = tcnp->cn_namelen;
 			NFSBCOPY(tdnp->n_fhp->nfh_fh, fnp->n_v4->n4_data,
 			    tdnp->n_fhp->nfh_len);
 			NFSBCOPY(tcnp->cn_nameptr,
 			    NFS4NODENAME(fnp->n_v4), tcnp->cn_namelen);
 		}
 		NFSUNLOCKNODE(tdnp);
 		NFSUNLOCKNODE(fnp);
 		if (newv4 != NULL)
 			free(newv4, M_NFSV4NODE);
 	}
 
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out:
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that it is a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs file rename rpc called from nfs_remove() above
  */
 static int
 nfs_renameit(struct vnode *sdvp, struct vnode *svp, struct componentname *scnp,
     struct sillyrename *sp)
 {
 
 	return (nfs_renamerpc(sdvp, svp, scnp->cn_nameptr, scnp->cn_namelen,
 	    sdvp, NULL, sp->s_name, sp->s_namlen, scnp->cn_cred,
 	    scnp->cn_thread));
 }
 
 /*
  * Do an nfs rename rpc. Called from nfs_rename() and nfs_renameit().
  */
 static int
 nfs_renamerpc(struct vnode *fdvp, struct vnode *fvp, char *fnameptr,
     int fnamelen, struct vnode *tdvp, struct vnode *tvp, char *tnameptr,
     int tnamelen, struct ucred *cred, struct thread *td)
 {
 	struct nfsvattr fnfsva, tnfsva;
 	struct nfsnode *fdnp = VTONFS(fdvp);
 	struct nfsnode *tdnp = VTONFS(tdvp);
 	int error = 0, fattrflag, tattrflag;
 
 	error = nfsrpc_rename(fdvp, fvp, fnameptr, fnamelen, tdvp, tvp,
 	    tnameptr, tnamelen, cred, td, &fnfsva, &tnfsva, &fattrflag,
 	    &tattrflag, NULL, NULL);
 	NFSLOCKNODE(fdnp);
 	fdnp->n_flag |= NMODIFIED;
 	if (fattrflag != 0) {
 		NFSUNLOCKNODE(fdnp);
 		(void) nfscl_loadattrcache(&fdvp, &fnfsva, NULL, NULL, 0, 1);
 	} else {
 		fdnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(fdnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(fdvp);
 	}
 	NFSLOCKNODE(tdnp);
 	tdnp->n_flag |= NMODIFIED;
 	if (tattrflag != 0) {
 		NFSUNLOCKNODE(tdnp);
 		(void) nfscl_loadattrcache(&tdvp, &tnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(tdnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (error && NFS_ISV4(fdvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs hard link create call
  */
 static int
 nfs_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np, *tdnp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag;
 
 	/*
 	 * Push all writes to the server, so that the attribute cache
 	 * doesn't get "out of sync" with the server.
 	 * XXX There should be a better way!
 	 */
 	VOP_FSYNC(vp, MNT_WAIT, cnp->cn_thread);
 
 	error = nfsrpc_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &attrflag,
 	    &dattrflag, NULL);
 	tdnp = VTONFS(tdvp);
 	NFSLOCKNODE(tdnp);
 	tdnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		NFSUNLOCKNODE(tdnp);
 		(void) nfscl_loadattrcache(&tdvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		tdnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(tdnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(tdvp);
 	}
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 	else {
 		np = VTONFS(vp);
 		NFSLOCKNODE(np);
 		np->n_attrstamp = 0;
 		NFSUNLOCKNODE(np);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(vp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(tdvp, vp, cnp, &nfsva.na_ctime, NULL);
 	}
 	if (error && NFS_ISV4(vp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs symbolic link create call
  */
 static int
 nfs_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsvattr nfsva, dnfsva;
 	struct nfsfh *nfhp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	int error = 0, attrflag, dattrflag, ret;
 
 	vap->va_type = VLNK;
 	error = nfsrpc_symlink(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    ap->a_target, vap, cnp->cn_cred, cnp->cn_thread, &dnfsva,
 	    &nfsva, &nfhp, &attrflag, &dattrflag, NULL);
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		if (!ret)
 			newvp = NFSTOV(np);
 		else if (!error)
 			error = ret;
 	}
 	if (newvp != NULL) {
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	} else if (!error) {
 		/*
 		 * If we do not have an error and we could not extract the
 		 * newvp from the response due to the request being NFSv2, we
 		 * have to do a lookup in order to obtain a newvp to return.
 		 */
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error)
 			newvp = NFSTOV(np);
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		*ap->a_vpp = newvp;
 	}
 
 	dnp = VTONFS(dvp);
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		NFSUNLOCKNODE(dnp);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(dnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	/*
 	 * If negative lookup caching is enabled, I might as well
 	 * add an entry for this node. Not necessary for correctness,
 	 * but if negative caching is enabled, then the system
 	 * must care about lookup caching hit rate, so...
 	 */
 	if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 	    (cnp->cn_flags & MAKEENTRY) && attrflag != 0 && error == 0) {
 		cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime, NULL);
 	}
 	return (error);
 }
 
 /*
  * nfs make dir call
  */
 static int
 nfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *np = NULL, *dnp;
 	struct vnode *newvp = NULL;
 	struct vattr vattr;
 	struct nfsfh *nfhp;
 	struct nfsvattr nfsva, dnfsva;
 	int error = 0, attrflag, dattrflag, ret;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)) != 0)
 		return (error);
 	vap->va_type = VDIR;
 	error = nfsrpc_mkdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    vap, cnp->cn_cred, cnp->cn_thread, &dnfsva, &nfsva, &nfhp,
 	    &attrflag, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		NFSUNLOCKNODE(dnp);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(dnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 	if (nfhp) {
 		ret = nfscl_nget(dvp->v_mount, dvp, nfhp, cnp, cnp->cn_thread,
 		    &np, NULL, LK_EXCLUSIVE);
 		if (!ret) {
 			newvp = NFSTOV(np);
 			if (attrflag)
 			   (void) nfscl_loadattrcache(&newvp, &nfsva, NULL,
 				NULL, 0, 1);
 		} else if (!error)
 			error = ret;
 	}
 	if (!error && newvp == NULL) {
 		error = nfs_lookitup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 		    cnp->cn_cred, cnp->cn_thread, &np);
 		if (!error) {
 			newvp = NFSTOV(np);
 			if (newvp->v_type != VDIR)
 				error = EEXIST;
 		}
 	}
 	if (error) {
 		if (newvp)
 			vput(newvp);
 		if (NFS_ISV4(dvp))
 			error = nfscl_maperr(cnp->cn_thread, error,
 			    vap->va_uid, vap->va_gid);
 	} else {
 		/*
 		 * If negative lookup caching is enabled, I might as well
 		 * add an entry for this node. Not necessary for correctness,
 		 * but if negative caching is enabled, then the system
 		 * must care about lookup caching hit rate, so...
 		 */
 		if (VFSTONFS(dvp->v_mount)->nm_negnametimeo != 0 &&
 		    (cnp->cn_flags & MAKEENTRY) &&
 		    attrflag != 0 && dattrflag != 0)
 			cache_enter_time(dvp, newvp, cnp, &nfsva.na_ctime,
 			    &dnfsva.na_ctime);
 		*ap->a_vpp = newvp;
 	}
 	return (error);
 }
 
 /*
  * nfs remove directory call
  */
 static int
 nfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct nfsnode *dnp;
 	struct nfsvattr dnfsva;
 	int error, dattrflag;
 
 	if (dvp == vp)
 		return (EINVAL);
 	error = nfsrpc_rmdir(dvp, cnp->cn_nameptr, cnp->cn_namelen,
 	    cnp->cn_cred, cnp->cn_thread, &dnfsva, &dattrflag, NULL);
 	dnp = VTONFS(dvp);
 	NFSLOCKNODE(dnp);
 	dnp->n_flag |= NMODIFIED;
 	if (dattrflag != 0) {
 		NFSUNLOCKNODE(dnp);
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	} else {
 		dnp->n_attrstamp = 0;
 		NFSUNLOCKNODE(dnp);
 		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(dvp);
 	}
 
 	cache_purge(dvp);
 	cache_purge(vp);
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(cnp->cn_thread, error, (uid_t)0,
 		    (gid_t)0);
 	/*
 	 * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry.
 	 */
 	if (error == ENOENT)
 		error = 0;
 	return (error);
 }
 
 /*
  * nfs readdir call
  */
 static int
 nfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct uio *uio = ap->a_uio;
 	ssize_t tresid, left;
 	int error = 0;
 	struct vattr vattr;
 	
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = 0;
 	if (vp->v_type != VDIR) 
 		return(EPERM);
 
 	/*
 	 * First, check for hit on the EOF offset cache
 	 */
 	if (np->n_direofoffset > 0 && uio->uio_offset >= np->n_direofoffset &&
 	    (np->n_flag & NMODIFIED) == 0) {
 		if (VOP_GETATTR(vp, &vattr, ap->a_cred) == 0) {
 			NFSLOCKNODE(np);
 			if ((NFS_ISV4(vp) && np->n_change == vattr.va_filerev) ||
 			    !NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime)) {
 				NFSUNLOCKNODE(np);
 				NFSINCRGLOBAL(nfsstatsv1.direofcache_hits);
 				if (ap->a_eofflag != NULL)
 					*ap->a_eofflag = 1;
 				return (0);
 			} else
 				NFSUNLOCKNODE(np);
 		}
 	}
 
 	/*
 	 * NFS always guarantees that directory entries don't straddle
 	 * DIRBLKSIZ boundaries.  As such, we need to limit the size
 	 * to an exact multiple of DIRBLKSIZ, to avoid copying a partial
 	 * directory entry.
 	 */
 	left = uio->uio_resid % DIRBLKSIZ;
 	if (left == uio->uio_resid)
 		return (EINVAL);
 	uio->uio_resid -= left;
 
 	/*
 	 * Call ncl_bioread() to do the real work.
 	 */
 	tresid = uio->uio_resid;
 	error = ncl_bioread(vp, uio, 0, ap->a_cred);
 
 	if (!error && uio->uio_resid == tresid) {
 		NFSINCRGLOBAL(nfsstatsv1.direofcache_misses);
 		if (ap->a_eofflag != NULL)
 			*ap->a_eofflag = 1;
 	}
 	
 	/* Add the partial DIRBLKSIZ (left) back in. */
 	uio->uio_resid += left;
 	return (error);
 }
 
 /*
  * Readdir rpc call.
  * Called from below the buffer cache by ncl_doio().
  */
 int
 ncl_readdirrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsvattr nfsva;
 	nfsuint64 *cookiep, cookie;
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, eof, attrflag;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		ncl_dircookie_unlock(dnp);
 	} else {
 		ncl_dircookie_unlock(dnp);		
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 
 	error = nfsrpc_readdir(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (!error) {
 		/*
 		 * We are now either at the end of the directory or have filled
 		 * the block.
 		 */
 		if (eof)
 			dnp->n_direofoffset = uiop->uio_offset;
 		else {
 			if (uiop->uio_resid > 0)
 				printf("EEK! readdirrpc resid > 0\n");
 			ncl_dircookie_lock(dnp);
 			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 			*cookiep = cookie;
 			ncl_dircookie_unlock(dnp);
 		}
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * NFS V3 readdir plus RPC. Used in place of ncl_readdirrpc().
  */
 int
 ncl_readdirplusrpc(struct vnode *vp, struct uio *uiop, struct ucred *cred,
     struct thread *td)
 {
 	struct nfsvattr nfsva;
 	nfsuint64 *cookiep, cookie;
 	struct nfsnode *dnp = VTONFS(vp);
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, attrflag, eof;
 
 	KASSERT(uiop->uio_iovcnt == 1 &&
 	    (uiop->uio_offset & (DIRBLKSIZ - 1)) == 0 &&
 	    (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0,
 	    ("nfs readdirplusrpc bad uio"));
 
 	/*
 	 * If there is no cookie, assume directory was stale.
 	 */
 	ncl_dircookie_lock(dnp);
 	cookiep = ncl_getcookie(dnp, uiop->uio_offset, 0);
 	if (cookiep) {
 		cookie = *cookiep;
 		ncl_dircookie_unlock(dnp);
 	} else {
 		ncl_dircookie_unlock(dnp);
 		return (NFSERR_BAD_COOKIE);
 	}
 
 	if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp))
 		(void)ncl_fsinfo(nmp, vp, cred, td);
 	error = nfsrpc_readdirplus(vp, uiop, &cookie, cred, td, &nfsva,
 	    &attrflag, &eof, NULL);
 	if (attrflag)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 
 	if (!error) {
 		/*
 		 * We are now either at end of the directory or have filled the
 		 * the block.
 		 */
 		if (eof)
 			dnp->n_direofoffset = uiop->uio_offset;
 		else {
 			if (uiop->uio_resid > 0)
 				printf("EEK! readdirplusrpc resid > 0\n");
 			ncl_dircookie_lock(dnp);
 			cookiep = ncl_getcookie(dnp, uiop->uio_offset, 1);
 			*cookiep = cookie;
 			ncl_dircookie_unlock(dnp);
 		}
 	} else if (NFS_ISV4(vp)) {
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	}
 	return (error);
 }
 
 /*
  * Silly rename. To make the NFS filesystem that is stateless look a little
  * more like the "ufs" a remove of an active vnode is translated to a rename
  * to a funny looking filename that is removed by nfs_inactive on the
  * nfsnode. There is the potential for another process on a different client
  * to create the same funny name between the nfs_lookitup() fails and the
  * nfs_rename() completes, but...
  */
 static int
 nfs_sillyrename(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 	struct sillyrename *sp;
 	struct nfsnode *np;
 	int error;
 	short pid;
 	unsigned int lticks;
 
 	cache_purge(dvp);
 	np = VTONFS(vp);
 	KASSERT(vp->v_type != VDIR, ("nfs: sillyrename dir"));
 	sp = malloc(sizeof (struct sillyrename),
 	    M_NEWNFSREQ, M_WAITOK);
 	sp->s_cred = crhold(cnp->cn_cred);
 	sp->s_dvp = dvp;
 	VREF(dvp);
 
 	/* 
 	 * Fudge together a funny name.
 	 * Changing the format of the funny name to accommodate more 
 	 * sillynames per directory.
 	 * The name is now changed to .nfs.<ticks>.<pid>.4, where ticks is 
 	 * CPU ticks since boot.
 	 */
 	pid = cnp->cn_thread->td_proc->p_pid;
 	lticks = (unsigned int)ticks;
 	for ( ; ; ) {
 		sp->s_namlen = sprintf(sp->s_name, 
 				       ".nfs.%08x.%04x4.4", lticks, 
 				       pid);
 		if (nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 				 cnp->cn_thread, NULL))
 			break;
 		lticks++;
 	}
 	error = nfs_renameit(dvp, vp, cnp, sp);
 	if (error)
 		goto bad;
 	error = nfs_lookitup(dvp, sp->s_name, sp->s_namlen, sp->s_cred,
 		cnp->cn_thread, &np);
 	np->n_sillyrename = sp;
 	return (0);
 bad:
 	vrele(sp->s_dvp);
 	crfree(sp->s_cred);
 	free(sp, M_NEWNFSREQ);
 	return (error);
 }
 
 /*
  * Look up a file name and optionally either update the file handle or
  * allocate an nfsnode, depending on the value of npp.
  * npp == NULL	--> just do the lookup
  * *npp == NULL --> allocate a new nfsnode and make sure attributes are
  *			handled too
  * *npp != NULL --> update the file handle in the vnode
  */
 static int
 nfs_lookitup(struct vnode *dvp, char *name, int len, struct ucred *cred,
     struct thread *td, struct nfsnode **npp)
 {
 	struct vnode *newvp = NULL, *vp;
 	struct nfsnode *np, *dnp = VTONFS(dvp);
 	struct nfsfh *nfhp, *onfhp;
 	struct nfsvattr nfsva, dnfsva;
 	struct componentname cn;
 	int error = 0, attrflag, dattrflag;
 	u_int hash;
 
 	error = nfsrpc_lookup(dvp, name, len, cred, td, &dnfsva, &nfsva,
 	    &nfhp, &attrflag, &dattrflag, NULL);
 	if (dattrflag)
 		(void) nfscl_loadattrcache(&dvp, &dnfsva, NULL, NULL, 0, 1);
 	if (npp && !error) {
 		if (*npp != NULL) {
 		    np = *npp;
 		    vp = NFSTOV(np);
 		    /*
 		     * For NFSv4, check to see if it is the same name and
 		     * replace the name, if it is different.
 		     */
 		    if (np->n_v4 != NULL && nfsva.na_type == VREG &&
 			(np->n_v4->n4_namelen != len ||
 			 NFSBCMP(name, NFS4NODENAME(np->n_v4), len) ||
 			 dnp->n_fhp->nfh_len != np->n_v4->n4_fhlen ||
 			 NFSBCMP(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 			 dnp->n_fhp->nfh_len))) {
 #ifdef notdef
 { char nnn[100]; int nnnl;
 nnnl = (len < 100) ? len : 99;
 bcopy(name, nnn, nnnl);
 nnn[nnnl] = '\0';
 printf("replace=%s\n",nnn);
 }
 #endif
 			    free(np->n_v4, M_NFSV4NODE);
 			    np->n_v4 = malloc(
 				sizeof (struct nfsv4node) +
 				dnp->n_fhp->nfh_len + len - 1,
 				M_NFSV4NODE, M_WAITOK);
 			    np->n_v4->n4_fhlen = dnp->n_fhp->nfh_len;
 			    np->n_v4->n4_namelen = len;
 			    NFSBCOPY(dnp->n_fhp->nfh_fh, np->n_v4->n4_data,
 				dnp->n_fhp->nfh_len);
 			    NFSBCOPY(name, NFS4NODENAME(np->n_v4), len);
 		    }
 		    hash = fnv_32_buf(nfhp->nfh_fh, nfhp->nfh_len,
 			FNV1_32_INIT);
 		    onfhp = np->n_fhp;
 		    /*
 		     * Rehash node for new file handle.
 		     */
 		    vfs_hash_rehash(vp, hash);
 		    np->n_fhp = nfhp;
 		    if (onfhp != NULL)
 			free(onfhp, M_NFSFH);
 		    newvp = NFSTOV(np);
 		} else if (NFS_CMPFH(dnp, nfhp->nfh_fh, nfhp->nfh_len)) {
 		    free(nfhp, M_NFSFH);
 		    VREF(dvp);
 		    newvp = dvp;
 		} else {
 		    cn.cn_nameptr = name;
 		    cn.cn_namelen = len;
 		    error = nfscl_nget(dvp->v_mount, dvp, nfhp, &cn, td,
 			&np, NULL, LK_EXCLUSIVE);
 		    if (error)
 			return (error);
 		    newvp = NFSTOV(np);
 		}
 		if (!attrflag && *npp == NULL) {
 			if (newvp == dvp)
 				vrele(newvp);
 			else
 				vput(newvp);
 			return (ENOENT);
 		}
 		if (attrflag)
 			(void) nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL,
 			    0, 1);
 	}
 	if (npp && *npp == NULL) {
 		if (error) {
 			if (newvp) {
 				if (newvp == dvp)
 					vrele(newvp);
 				else
 					vput(newvp);
 			}
 		} else
 			*npp = np;
 	}
 	if (error && NFS_ISV4(dvp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Nfs Version 3 and 4 commit rpc
  */
 int
 ncl_commit(struct vnode *vp, u_quad_t offset, int cnt, struct ucred *cred,
    struct thread *td)
 {
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	struct nfsnode *np;
 	struct uio uio;
 	int error, attrflag;
 
 	np = VTONFS(vp);
 	error = EIO;
 	attrflag = 0;
 	if (NFSHASPNFS(nmp) && (np->n_flag & NDSCOMMIT) != 0) {
 		uio.uio_offset = offset;
 		uio.uio_resid = cnt;
 		error = nfscl_doiods(vp, &uio, NULL, NULL,
 		    NFSV4OPEN_ACCESSWRITE, 1, cred, td);
 		if (error != 0) {
 			NFSLOCKNODE(np);
 			np->n_flag &= ~NDSCOMMIT;
 			NFSUNLOCKNODE(np);
 		}
 	}
 	if (error != 0) {
 		mtx_lock(&nmp->nm_mtx);
 		if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
 			mtx_unlock(&nmp->nm_mtx);
 			return (0);
 		}
 		mtx_unlock(&nmp->nm_mtx);
 		error = nfsrpc_commit(vp, offset, cnt, cred, td, &nfsva,
 		    &attrflag, NULL);
 	}
 	if (attrflag != 0)
 		(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL,
 		    0, 1);
 	if (error != 0 && NFS_ISV4(vp))
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * Strategy routine.
  * For async requests when nfsiod(s) are running, queue the request by
  * calling ncl_asyncio(), otherwise just all ncl_doio() to do the
  * request.
  */
 static int
 nfs_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp;
 	struct vnode *vp;
 	struct ucred *cr;
 
 	bp = ap->a_bp;
 	vp = ap->a_vp;
 	KASSERT(bp->b_vp == vp, ("missing b_getvp"));
 	KASSERT(!(bp->b_flags & B_DONE),
 	    ("nfs_strategy: buffer %p unexpectedly marked B_DONE", bp));
 
 	if (vp->v_type == VREG && bp->b_blkno == bp->b_lblkno)
 		bp->b_blkno = bp->b_lblkno * (vp->v_bufobj.bo_bsize /
 		    DEV_BSIZE);
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 
 	/*
 	 * If the op is asynchronous and an i/o daemon is waiting
 	 * queue the request, wake it up and wait for completion
 	 * otherwise just do it ourselves.
 	 */
 	if ((bp->b_flags & B_ASYNC) == 0 ||
 	    ncl_asyncio(VFSTONFS(vp->v_mount), bp, NOCRED, curthread))
 		(void) ncl_doio(vp, bp, cr, curthread, 1);
 	return (0);
 }
 
 /*
  * fsync vnode op. Just call ncl_flush() with commit == 1.
  */
 /* ARGSUSED */
 static int
 nfs_fsync(struct vop_fsync_args *ap)
 {
 
 	if (ap->a_vp->v_type != VREG) {
 		/*
 		 * For NFS, metadata is changed synchronously on the server,
 		 * so there is nothing to flush. Also, ncl_flush() clears
 		 * the NMODIFIED flag and that shouldn't be done here for
 		 * directories.
 		 */
 		return (0);
 	}
 	return (ncl_flush(ap->a_vp, ap->a_waitfor, ap->a_td, 1, 0));
 }
 
 /*
  * Flush all the blocks associated with a vnode.
  * 	Walk through the buffer pool and push any dirty pages
  *	associated with the vnode.
  * If the called_from_renewthread argument is TRUE, it has been called
  * from the NFSv4 renew thread and, as such, cannot block indefinitely
  * waiting for a buffer write to complete.
  */
 int
 ncl_flush(struct vnode *vp, int waitfor, struct thread *td,
     int commit, int called_from_renewthread)
 {
 	struct nfsnode *np = VTONFS(vp);
 	struct buf *bp;
 	int i;
 	struct buf *nbp;
 	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 	int error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
 	int passone = 1, trycnt = 0;
 	u_quad_t off, endoff, toff;
 	struct ucred* wcred = NULL;
 	struct buf **bvec = NULL;
 	struct bufobj *bo;
 #ifndef NFS_COMMITBVECSIZ
 #define	NFS_COMMITBVECSIZ	20
 #endif
 	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
 	u_int bvecsize = 0, bveccount;
 
 	if (called_from_renewthread != 0)
 		slptimeo = hz;
 	if (nmp->nm_flag & NFSMNT_INT)
 		slpflag = PCATCH;
 	if (!commit)
 		passone = 0;
 	bo = &vp->v_bufobj;
 	/*
 	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
 	 * server, but has not been committed to stable storage on the server
 	 * yet. On the first pass, the byte range is worked out and the commit
 	 * rpc is done. On the second pass, ncl_writebp() is called to do the
 	 * job.
 	 */
 again:
 	off = (u_quad_t)-1;
 	endoff = 0;
 	bvecpos = 0;
 	if (NFS_ISV34(vp) && commit) {
 		if (bvec != NULL && bvec != bvec_on_stack)
 			free(bvec, M_TEMP);
 		/*
 		 * Count up how many buffers waiting for a commit.
 		 */
 		bveccount = 0;
 		BO_LOCK(bo);
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (!BUF_ISLOCKED(bp) &&
 			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
 				== (B_DELWRI | B_NEEDCOMMIT))
 				bveccount++;
 		}
 		/*
 		 * Allocate space to remember the list of bufs to commit.  It is
 		 * important to use M_NOWAIT here to avoid a race with nfs_write.
 		 * If we can't get memory (for whatever reason), we will end up
 		 * committing the buffers one-by-one in the loop below.
 		 */
 		if (bveccount > NFS_COMMITBVECSIZ) {
 			/*
 			 * Release the vnode interlock to avoid a lock
 			 * order reversal.
 			 */
 			BO_UNLOCK(bo);
 			bvec = (struct buf **)
 				malloc(bveccount * sizeof(struct buf *),
 				       M_TEMP, M_NOWAIT);
 			BO_LOCK(bo);
 			if (bvec == NULL) {
 				bvec = bvec_on_stack;
 				bvecsize = NFS_COMMITBVECSIZ;
 			} else
 				bvecsize = bveccount;
 		} else {
 			bvec = bvec_on_stack;
 			bvecsize = NFS_COMMITBVECSIZ;
 		}
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bvecpos >= bvecsize)
 				break;
 			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
 			    (B_DELWRI | B_NEEDCOMMIT)) {
 				BUF_UNLOCK(bp);
 				nbp = TAILQ_NEXT(bp, b_bobufs);
 				continue;
 			}
 			BO_UNLOCK(bo);
 			bremfree(bp);
 			/*
 			 * Work out if all buffers are using the same cred
 			 * so we can deal with them all with one commit.
 			 *
 			 * NOTE: we are not clearing B_DONE here, so we have
 			 * to do it later on in this routine if we intend to
 			 * initiate I/O on the bp.
 			 *
 			 * Note: to avoid loopback deadlocks, we do not
 			 * assign b_runningbufspace.
 			 */
 			if (wcred == NULL)
 				wcred = bp->b_wcred;
 			else if (wcred != bp->b_wcred)
 				wcred = NOCRED;
 			vfs_busy_pages(bp, 1);
 
 			BO_LOCK(bo);
 			/*
 			 * bp is protected by being locked, but nbp is not
 			 * and vfs_busy_pages() may sleep.  We have to
 			 * recalculate nbp.
 			 */
 			nbp = TAILQ_NEXT(bp, b_bobufs);
 
 			/*
 			 * A list of these buffers is kept so that the
 			 * second loop knows which buffers have actually
 			 * been committed. This is necessary, since there
 			 * may be a race between the commit rpc and new
 			 * uncommitted writes on the file.
 			 */
 			bvec[bvecpos++] = bp;
 			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 				bp->b_dirtyoff;
 			if (toff < off)
 				off = toff;
 			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
 			if (toff > endoff)
 				endoff = toff;
 		}
 		BO_UNLOCK(bo);
 	}
 	if (bvecpos > 0) {
 		/*
 		 * Commit data on the server, as required.
 		 * If all bufs are using the same wcred, then use that with
 		 * one call for all of them, otherwise commit each one
 		 * separately.
 		 */
 		if (wcred != NOCRED)
 			retv = ncl_commit(vp, off, (int)(endoff - off),
 					  wcred, td);
 		else {
 			retv = 0;
 			for (i = 0; i < bvecpos; i++) {
 				off_t off, size;
 				bp = bvec[i];
 				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
 					bp->b_dirtyoff;
 				size = (u_quad_t)(bp->b_dirtyend
 						  - bp->b_dirtyoff);
 				retv = ncl_commit(vp, off, (int)size,
 						  bp->b_wcred, td);
 				if (retv) break;
 			}
 		}
 
 		if (retv == NFSERR_STALEWRITEVERF)
 			ncl_clearcommit(vp->v_mount);
 
 		/*
 		 * Now, either mark the blocks I/O done or mark the
 		 * blocks dirty, depending on whether the commit
 		 * succeeded.
 		 */
 		for (i = 0; i < bvecpos; i++) {
 			bp = bvec[i];
 			bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 			if (retv) {
 				/*
 				 * Error, leave B_DELWRI intact
 				 */
 				vfs_unbusy_pages(bp);
 				brelse(bp);
 			} else {
 				/*
 				 * Success, remove B_DELWRI ( bundirty() ).
 				 *
 				 * b_dirtyoff/b_dirtyend seem to be NFS
 				 * specific.  We should probably move that
 				 * into bundirty(). XXX
 				 */
 				bufobj_wref(bo);
 				bp->b_flags |= B_ASYNC;
 				bundirty(bp);
 				bp->b_flags &= ~B_DONE;
 				bp->b_ioflags &= ~BIO_ERROR;
 				bp->b_dirtyoff = bp->b_dirtyend = 0;
 				bufdone(bp);
 			}
 		}
 	}
 
 	/*
 	 * Start/do any write(s) that are required.
 	 */
 loop:
 	BO_LOCK(bo);
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			if (waitfor != MNT_WAIT || passone)
 				continue;
 
 			error = BUF_TIMELOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo), "nfsfsync", slpflag, slptimeo);
 			if (error == 0) {
 				BUF_UNLOCK(bp);
 				goto loop;
 			}
 			if (error == ENOLCK) {
 				error = 0;
 				goto loop;
 			}
 			if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so the flush will be retried
 				 * later.
 				 */
 				error = EIO;
 				goto done;
 			}
 			if (newnfs_sigintr(nmp, td)) {
 				error = EINTR;
 				goto done;
 			}
 			if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			}
 			goto loop;
 		}
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("nfs_fsync: not dirty");
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		BO_UNLOCK(bo);
 		bremfree(bp);
 		bp->b_flags |= B_ASYNC;
 		bwrite(bp);
 		if (newnfs_sigintr(nmp, td)) {
 			error = EINTR;
 			goto done;
 		}
 		goto loop;
 	}
 	if (passone) {
 		passone = 0;
 		BO_UNLOCK(bo);
 		goto again;
 	}
 	if (waitfor == MNT_WAIT) {
 		while (bo->bo_numoutput) {
 			error = bufobj_wwait(bo, slpflag, slptimeo);
 			if (error) {
 			    BO_UNLOCK(bo);
 			    if (called_from_renewthread != 0) {
 				/*
 				 * Return EIO so that the flush will be
 				 * retried later.
 				 */
 				error = EIO;
 				goto done;
 			    }
 			    error = newnfs_sigintr(nmp, td);
 			    if (error)
 				goto done;
 			    if (slpflag == PCATCH) {
 				slpflag = 0;
 				slptimeo = 2 * hz;
 			    }
 			    BO_LOCK(bo);
 			}
 		}
 		if (bo->bo_dirty.bv_cnt != 0 && commit) {
 			BO_UNLOCK(bo);
 			goto loop;
 		}
 		/*
 		 * Wait for all the async IO requests to drain
 		 */
 		BO_UNLOCK(bo);
 		NFSLOCKNODE(np);
 		while (np->n_directio_asyncwr > 0) {
 			np->n_flag |= NFSYNCWAIT;
 			error = newnfs_msleep(td, &np->n_directio_asyncwr,
 			    &np->n_mtx, slpflag | (PRIBIO + 1), 
 			    "nfsfsync", 0);
 			if (error) {
 				if (newnfs_sigintr(nmp, td)) {
 					NFSUNLOCKNODE(np);
 					error = EINTR;	
 					goto done;
 				}
 			}
 		}
 		NFSUNLOCKNODE(np);
 	} else
 		BO_UNLOCK(bo);
 	if (NFSHASPNFS(nmp)) {
 		nfscl_layoutcommit(vp, td);
 		/*
 		 * Invalidate the attribute cache, since writes to a DS
 		 * won't update the size attribute.
 		 */
 		NFSLOCKNODE(np);
 		np->n_attrstamp = 0;
 	} else
 		NFSLOCKNODE(np);
 	if (np->n_flag & NWRITEERR) {
 		error = np->n_error;
 		np->n_flag &= ~NWRITEERR;
 	}
   	if (commit && bo->bo_dirty.bv_cnt == 0 &&
 	    bo->bo_numoutput == 0 && np->n_directio_asyncwr == 0)
   		np->n_flag &= ~NMODIFIED;
 	NFSUNLOCKNODE(np);
 done:
 	if (bvec != NULL && bvec != bvec_on_stack)
 		free(bvec, M_TEMP);
 	if (error == 0 && commit != 0 && waitfor == MNT_WAIT &&
 	    (bo->bo_dirty.bv_cnt != 0 || bo->bo_numoutput != 0 ||
 	    np->n_directio_asyncwr != 0)) {
 		if (trycnt++ < 5) {
 			/* try, try again... */
 			passone = 1;
 			wcred = NULL;
 			bvec = NULL;
 			bvecsize = 0;
 			goto again;
 		}
 		vn_printf(vp, "ncl_flush failed");
 		error = called_from_renewthread != 0 ? EIO : EBUSY;
 	}
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  */
 static int
 nfs_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred;
 	struct nfsnode *np = VTONFS(ap->a_vp);
 	struct proc *p = (struct proc *)ap->a_id;
 	struct thread *td = curthread;	/* XXX */
 	struct vattr va;
 	int ret, error;
 	u_quad_t size;
 	
 	error = NFSVOPLOCK(vp, LK_SHARED);
 	if (error != 0)
 		return (EBADF);
 	if (NFS_ISV4(vp) && (ap->a_flags & (F_POSIX | F_FLOCK)) != 0) {
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((ap->a_flags & F_POSIX) != 0)
 			cred = p->p_ucred;
 		else
 			cred = td->td_ucred;
 		NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(vp)) {
 			error = EBADF;
 			goto out;
 		}
 
 		/*
 		 * If this is unlocking a write locked region, flush and
 		 * commit them before unlocking. This is required by
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_UNLCK &&
 		    nfscl_checkwritelocked(vp, ap->a_fl, cred, td, ap->a_id,
 		    ap->a_flags))
 			(void) ncl_flush(vp, MNT_WAIT, td, 1, 0);
 
 		/*
 		 * Loop around doing the lock op, while a blocking lock
 		 * must wait for the lock op to succeed.
 		 */
 		do {
 			ret = nfsrpc_advlock(vp, np->n_size, ap->a_op,
 			    ap->a_fl, 0, cred, td, ap->a_id, ap->a_flags);
 			if (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 			    ap->a_op == F_SETLK) {
 				NFSVOPUNLOCK(vp, 0);
 				error = nfs_catnap(PZERO | PCATCH, ret,
 				    "ncladvl");
 				if (error)
 					return (EINTR);
 				NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
 				if (VN_IS_DOOMED(vp)) {
 					error = EBADF;
 					goto out;
 				}
 			}
 		} while (ret == NFSERR_DENIED && (ap->a_flags & F_WAIT) &&
 		     ap->a_op == F_SETLK);
 		if (ret == NFSERR_DENIED) {
 			error = EAGAIN;
 			goto out;
 		} else if (ret == EINVAL || ret == EBADF || ret == EINTR) {
 			error = ret;
 			goto out;
 		} else if (ret != 0) {
 			error = EACCES;
 			goto out;
 		}
 
 		/*
 		 * Now, if we just got a lock, invalidate data in the buffer
 		 * cache, as required, so that the coherency conforms with
 		 * RFC3530 Sec. 9.3.2.
 		 */
 		if (ap->a_op == F_SETLK) {
 			if ((np->n_flag & NMODIFIED) == 0) {
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 			}
 			if ((np->n_flag & NMODIFIED) || ret ||
 			    np->n_change != va.va_filerev) {
 				(void) ncl_vinvalbuf(vp, V_SAVE, td, 1);
 				np->n_attrstamp = 0;
 				KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
 				ret = VOP_GETATTR(vp, &va, cred);
 				if (!ret) {
 					np->n_mtime = va.va_mtime;
 					np->n_change = va.va_filerev;
 				}
 			}
 			/* Mark that a file lock has been acquired. */
 			NFSLOCKNODE(np);
 			np->n_flag |= NHASBEENLOCKED;
 			NFSUNLOCKNODE(np);
 		}
 	} else if (!NFS_ISV4(vp)) {
 		if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 			size = VTONFS(vp)->n_size;
 			NFSVOPUNLOCK(vp, 0);
 			error = lf_advlock(ap, &(vp->v_lockf), size);
 		} else {
 			if (nfs_advlock_p != NULL)
 				error = nfs_advlock_p(ap);
 			else {
 				NFSVOPUNLOCK(vp, 0);
 				error = ENOLCK;
 			}
 		}
 		if (error == 0 && ap->a_op == F_SETLK) {
 			error = NFSVOPLOCK(vp, LK_SHARED);
 			if (error == 0) {
 				/* Mark that a file lock has been acquired. */
 				NFSLOCKNODE(np);
 				np->n_flag |= NHASBEENLOCKED;
 				NFSUNLOCKNODE(np);
 				NFSVOPUNLOCK(vp, 0);
 			}
 		}
 		return (error);
 	} else
 		error = EOPNOTSUPP;
 out:
 	NFSVOPUNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * NFS advisory byte-level locks.
  */
 static int
 nfs_advlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	u_quad_t size;
 	int error;
 	
 	if (NFS_ISV4(vp))
 		return (EOPNOTSUPP);
 	error = NFSVOPLOCK(vp, LK_SHARED);
 	if (error)
 		return (error);
 	if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NOLOCKD) != 0) {
 		size = VTONFS(vp)->n_size;
 		NFSVOPUNLOCK(vp, 0);
 		error = lf_advlockasync(ap, &(vp->v_lockf), size);
 	} else {
 		NFSVOPUNLOCK(vp, 0);
 		error = EOPNOTSUPP;
 	}
 	return (error);
 }
 
 /*
  * Print out the contents of an nfsnode.
  */
 static int
 nfs_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 
 	printf("\tfileid %jd fsid 0x%jx", (uintmax_t)np->n_vattr.na_fileid,
 	    (uintmax_t)np->n_vattr.na_fsid);
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * This is the "real" nfs::bwrite(struct buf*).
  * We set B_CACHE if this is a VMIO buffer.
  */
 int
 ncl_writebp(struct buf *bp, int force __unused, struct thread *td)
 {
 	int oldflags, rtval;
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	oldflags = bp->b_flags;
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Undirty the bp.  We will redirty it later if the I/O fails.
 	 */
 	bundirty(bp);
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_iocmd = BIO_WRITE;
 
 	bufobj_wref(bp->b_bufobj);
 	curthread->td_ru.ru_oublock++;
 
 	/*
 	 * Note: to avoid loopback deadlocks, we do not
 	 * assign b_runningbufspace.
 	 */
 	vfs_busy_pages(bp, 1);
 
 	BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) != 0)
 		return (0);
 
 	rtval = bufwait(bp);
 	if (oldflags & B_DELWRI)
 		reassignbuf(bp);
 	brelse(bp);
 	return (rtval);
 }
 
 /*
  * nfs special file access vnode op.
  * Essentially just get vattr and then imitate iaccess() since the device is
  * local to the client.
  */
 static int
 nfsspec_access(struct vop_access_args *ap)
 {
 	struct vattr *vap;
 	struct ucred *cred = ap->a_cred;
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	struct vattr vattr;
 	int error;
 
 	/*
 	 * Disallow write attempts on filesystems mounted read-only;
 	 * unless the file is a socket, fifo, or a block or character
 	 * device resident on the filesystem.
 	 */
 	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 	vap = &vattr;
 	error = VOP_GETATTR(vp, vap, cred);
 	if (error)
 		goto out;
 	error  = vaccess(vp->v_type, vap->va_mode, vap->va_uid, vap->va_gid,
 	    accmode, cred, NULL);
 out:
 	return error;
 }
 
 /*
  * Read wrapper for fifos.
  */
 static int
 nfsfifo_read(struct vop_read_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 	int error;
 
 	/*
 	 * Set access flag.
 	 */
 	NFSLOCKNODE(np);
 	np->n_flag |= NACC;
 	vfs_timestamp(&np->n_atim);
 	NFSUNLOCKNODE(np);
 	error = fifo_specops.vop_read(ap);
 	return error;	
 }
 
 /*
  * Write wrapper for fifos.
  */
 static int
 nfsfifo_write(struct vop_write_args *ap)
 {
 	struct nfsnode *np = VTONFS(ap->a_vp);
 
 	/*
 	 * Set update flag.
 	 */
 	NFSLOCKNODE(np);
 	np->n_flag |= NUPD;
 	vfs_timestamp(&np->n_mtim);
 	NFSUNLOCKNODE(np);
 	return(fifo_specops.vop_write(ap));
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the nfsnode then do fifo close.
  */
 static int
 nfsfifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsnode *np = VTONFS(vp);
 	struct vattr vattr;
 	struct timespec ts;
 
 	NFSLOCKNODE(np);
 	if (np->n_flag & (NACC | NUPD)) {
 		vfs_timestamp(&ts);
 		if (np->n_flag & NACC)
 			np->n_atim = ts;
 		if (np->n_flag & NUPD)
 			np->n_mtim = ts;
 		np->n_flag |= NCHG;
 		if (vrefcnt(vp) == 1 &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			VATTR_NULL(&vattr);
 			if (np->n_flag & NACC)
 				vattr.va_atime = np->n_atim;
 			if (np->n_flag & NUPD)
 				vattr.va_mtime = np->n_mtim;
 			NFSUNLOCKNODE(np);
 			(void)VOP_SETATTR(vp, &vattr, ap->a_cred);
 			goto out;
 		}
 	}
 	NFSUNLOCKNODE(np);
 out:
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Just call ncl_writebp() with the force argument set to 1.
  *
  * NOTE: B_DONE may or may not be set in a_bp on call.
  */
 static int
 nfs_bwrite(struct buf *bp)
 {
 
 	return (ncl_writebp(bp, 1, curthread));
 }
 
 struct buf_ops buf_ops_newnfs = {
 	.bop_name	=	"buf_ops_nfs",
 	.bop_write	=	nfs_bwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 static int
 nfs_getacl(struct vop_getacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_getacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 static int
 nfs_setacl(struct vop_setacl_args *ap)
 {
 	int error;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EOPNOTSUPP);
 	error = nfsrpc_setacl(ap->a_vp, ap->a_cred, ap->a_td, ap->a_aclp,
 	    NULL);
 	if (error > NFSERR_STALE) {
 		(void) nfscl_maperr(ap->a_td, error, (uid_t)0, (gid_t)0);
 		error = EPERM;
 	}
 	return (error);
 }
 
 /*
  * VOP_ADVISE for NFS.
  * Just return 0 for any errors, since it is just a hint.
  */
 static int
 nfs_advise(struct vop_advise_args *ap)
 {
 	struct thread *td = curthread;
 	struct nfsmount *nmp;
 	uint64_t len;
 	int error;
 
 	/*
 	 * First do vop_stdadvise() to handle the buffer cache.
 	 */
 	error = vop_stdadvise(ap);
 	if (error != 0)
 		return (error);
 	if (ap->a_start < 0 || ap->a_end < 0)
 		return (0);
 	if (ap->a_end == OFF_MAX)
 		len = 0;
 	else if (ap->a_end < ap->a_start)
 		return (0);
 	else
 		len = ap->a_end - ap->a_start + 1;
 	nmp = VFSTONFS(ap->a_vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (NFSHASPNFS(nmp) && (nmp->nm_privflag & NFSMNTP_IOADVISETHRUMDS) ==
 	    0) || (nmp->nm_privflag & NFSMNTP_NOADVISE) != 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (0);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 	error = nfsrpc_advise(ap->a_vp, ap->a_start, len, ap->a_advice,
 	    td->td_ucred, td);
 	if (error == NFSERR_NOTSUPP) {
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_NOADVISE;
 		mtx_unlock(&nmp->nm_mtx);
 	}
 	return (0);
 }
 
 /*
  * nfs allocate call
  */
 static int
 nfs_allocate(struct vop_allocate_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = curthread;
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp;
 	int attrflag, error, ret;
 
 	attrflag = 0;
 	nmp = VFSTONFS(vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (NFSHASNFSV4(nmp) && nmp->nm_minorvers >= NFSV42_MINORVERSION &&
 	    (nmp->nm_privflag & NFSMNTP_NOALLOCATE) == 0) {
 		mtx_unlock(&nmp->nm_mtx);
 		/*
 		 * Flush first to ensure that the allocate adds to the
 		 * file's allocation on the server.
 		 */
 		error = ncl_flush(vp, MNT_WAIT, td, 1, 0);
 		if (error == 0)
 			error = nfsrpc_allocate(vp, *ap->a_offset, *ap->a_len,
 			    &nfsva, &attrflag, td->td_ucred, td, NULL);
 		if (error == 0) {
 			*ap->a_offset += *ap->a_len;
 			*ap->a_len = 0;
 		} else if (error == NFSERR_NOTSUPP) {
 			mtx_lock(&nmp->nm_mtx);
 			nmp->nm_privflag |= NFSMNTP_NOALLOCATE;
 			mtx_unlock(&nmp->nm_mtx);
 		}
 	} else {
 		mtx_unlock(&nmp->nm_mtx);
 		error = EIO;
 	}
 	/*
 	 * If the NFS server cannot perform the Allocate operation, just call
 	 * vop_stdallocate() to perform it.
 	 */
 	if (error != 0)
 		error = vop_stdallocate(ap);
 	if (attrflag != 0) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (error == 0 && ret != 0)
 			error = ret;
 	}
 	if (error != 0)
 		error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs copy_file_range call
  */
 static int
 nfs_copy_file_range(struct vop_copy_file_range_args *ap)
 {
 	struct vnode *invp = ap->a_invp;
 	struct vnode *outvp = ap->a_outvp;
 	struct mount *mp;
 	struct nfsvattr innfsva, outnfsva;
 	struct vattr *vap;
 	struct uio io;
 	struct nfsmount *nmp;
 	size_t len, len2, copiedlen;
 	int error, inattrflag, outattrflag, ret, ret2;
 	off_t inoff, outoff;
 	bool consecutive, must_commit, tryoutcred;
 
 	ret = ret2 = 0;
 	nmp = VFSTONFS(invp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	/* NFSv4.2 Copy is not permitted for infile == outfile. */
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (nmp->nm_privflag & NFSMNTP_NOCOPY) != 0 || invp == outvp) {
 		mtx_unlock(&nmp->nm_mtx);
 		error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
 		    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
 		    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 		return (error);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	/* Lock both vnodes, avoiding risk of deadlock. */
 	do {
 		mp = NULL;
 		error = vn_start_write(outvp, &mp, V_WAIT);
 		if (error == 0) {
 			error = vn_lock(outvp, LK_EXCLUSIVE);
 			if (error == 0) {
 				error = vn_lock(invp, LK_SHARED | LK_NOWAIT);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(outvp, 0);
 				if (mp != NULL)
 					vn_finished_write(mp);
 				mp = NULL;
 				error = vn_lock(invp, LK_SHARED);
 				if (error == 0)
 					VOP_UNLOCK(invp, 0);
 			}
 		}
 		if (mp != NULL)
 			vn_finished_write(mp);
 	} while (error == 0);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Do the vn_rlimit_fsize() check.  Should this be above the VOP layer?
 	 */
 	io.uio_offset = *ap->a_outoffp;
 	io.uio_resid = *ap->a_lenp;
 	error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
 
 	/*
 	 * Flush the input file so that the data is up to date before
 	 * the copy.  Flush writes for the output file so that they
 	 * do not overwrite the data copied to the output file by the Copy.
 	 * Set the commit argument for both flushes so that the data is on
 	 * stable storage before the Copy RPC.  This is done in case the
 	 * server reboots during the Copy and needs to be redone.
 	 */
 	if (error == 0)
 		error = ncl_flush(invp, MNT_WAIT, curthread, 1, 0);
 	if (error == 0)
 		error = ncl_flush(outvp, MNT_WAIT, curthread, 1, 0);
 
 	/* Do the actual NFSv4.2 RPC. */
 	len = *ap->a_lenp;
 	mtx_lock(&nmp->nm_mtx);
 	if ((nmp->nm_privflag & NFSMNTP_NOCONSECUTIVE) == 0)
 		consecutive = true;
 	else
 		consecutive = false;
 	mtx_unlock(&nmp->nm_mtx);
 	inoff = *ap->a_inoffp;
 	outoff = *ap->a_outoffp;
 	tryoutcred = true;
 	must_commit = false;
 	if (error == 0) {
 		vap = &VTONFS(invp)->n_vattr.na_vattr;
 		error = VOP_GETATTR(invp, vap, ap->a_incred);
 		if (error == 0) {
 			/*
 			 * Clip "len" at va_size so that RFC compliant servers
 			 * will not reply NFSERR_INVAL.
 			 * Setting "len == 0" for the RPC would be preferred,
 			 * but some Linux servers do not support that.
 			 */
 			if (inoff >= vap->va_size)
 				*ap->a_lenp = len = 0;
 			else if (inoff + len > vap->va_size)
 				*ap->a_lenp = len = vap->va_size - inoff;
 		} else
 			error = 0;
 	}
 	copiedlen = 0;
 	while (len > 0 && error == 0) {
 		inattrflag = outattrflag = 0;
 		len2 = len;
 		if (tryoutcred)
 			error = nfsrpc_copy_file_range(invp, ap->a_inoffp,
 			    outvp, ap->a_outoffp, &len2, ap->a_flags,
 			    &inattrflag, &innfsva, &outattrflag, &outnfsva,
 			    ap->a_outcred, consecutive, &must_commit);
 		else
 			error = nfsrpc_copy_file_range(invp, ap->a_inoffp,
 			    outvp, ap->a_outoffp, &len2, ap->a_flags,
 			    &inattrflag, &innfsva, &outattrflag, &outnfsva,
 			    ap->a_incred, consecutive, &must_commit);
 		if (inattrflag != 0)
 			ret = nfscl_loadattrcache(&invp, &innfsva, NULL, NULL,
 			    0, 1);
 		if (outattrflag != 0)
 			ret2 = nfscl_loadattrcache(&outvp, &outnfsva, NULL,
 			    NULL, 1, 1);
 		if (error == 0) {
 			if (consecutive == false) {
 				if (len2 == len) {
 					mtx_lock(&nmp->nm_mtx);
 					nmp->nm_privflag |=
 					    NFSMNTP_NOCONSECUTIVE;
 					mtx_unlock(&nmp->nm_mtx);
 				} else
 					error = NFSERR_OFFLOADNOREQS;
 			}
 			/*
 			 * If the Copy returns a length == 0, it hit the
 			 * EOF on the input file.
 			 */
 			if (len2 == 0) {
 				*ap->a_lenp = copiedlen;
 				len = 0;
 			} else {
 				len -= len2;
 				copiedlen += len2;
 			}
 			if (len == 0 && must_commit && error == 0)
 				error = ncl_commit(outvp, outoff, *ap->a_lenp,
 				    ap->a_outcred, curthread);
 			if (error == 0 && ret != 0)
 				error = ret;
 			if (error == 0 && ret2 != 0)
 				error = ret2;
 		} else if (error == NFSERR_OFFLOADNOREQS && consecutive) {
 			/*
 			 * Try consecutive == false, which is ok only if all
 			 * bytes are copied.
 			 */
 			consecutive = false;
 			error = 0;
 		} else if (error == NFSERR_ACCES && tryoutcred) {
 			/* Try again with incred. */
 			tryoutcred = false;
 			error = 0;
 		}
 		if (error == NFSERR_STALEWRITEVERF) {
 			/*
 			 * Server rebooted, so do it all again.
 			 */
 			*ap->a_inoffp = inoff;
 			*ap->a_outoffp = outoff;
 			len = *ap->a_lenp;
 			must_commit = false;
 			error = 0;
 		}
 	}
 	VOP_UNLOCK(invp, 0);
 	VOP_UNLOCK(outvp, 0);
 	if (mp != NULL)
 		vn_finished_write(mp);
 	if (error == NFSERR_NOTSUPP || error == NFSERR_OFFLOADNOREQS ||
 	    error == NFSERR_ACCES) {
 		/*
 		 * Unlike the NFSv4.2 Copy, vn_generic_copy_file_range() can
 		 * use a_incred for the read and a_outcred for the write, so
 		 * try this for NFSERR_ACCES failures for the Copy.
 		 * For NFSERR_NOTSUPP and NFSERR_OFFLOADNOREQS, the Copy can
 		 * never succeed, so disable it.
 		 */
 		if (error != NFSERR_ACCES) {
 			/* Can never do Copy on this mount. */
 			mtx_lock(&nmp->nm_mtx);
 			nmp->nm_privflag |= NFSMNTP_NOCOPY;
 			mtx_unlock(&nmp->nm_mtx);
 		}
 		*ap->a_inoffp = inoff;
 		*ap->a_outoffp = outoff;
 		error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
 		    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags,
 		    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 	} else if (error != 0)
 		*ap->a_lenp = 0;
 
 	if (error != 0)
 		error = nfscl_maperr(curthread, error, (uid_t)0, (gid_t)0);
 	return (error);
 }
 
 /*
  * nfs ioctl call
  */
 static int
 nfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsvattr nfsva;
 	struct nfsmount *nmp;
 	int attrflag, content, error, ret;
 	bool eof = false;			/* shut up compiler. */
 
 	if (vp->v_type != VREG)
 		return (ENOTTY);
 	nmp = VFSTONFS(vp->v_mount);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION) {
 		error = vop_stdioctl(ap);
 		return (error);
 	}
 
 	/* Do the actual NFSv4.2 RPC. */
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
 		content = NFSV4CONTENT_DATA;
 		break;
 	case FIOSEEKHOLE:
 		content = NFSV4CONTENT_HOLE;
 		break;
 	default:
 		return (ENOTTY);
 	}
 
 	error = vn_lock(vp, LK_SHARED);
 	if (error != 0)
 		return (EBADF);
 	attrflag = 0;
 	if (*((off_t *)ap->a_data) >= VTONFS(vp)->n_size)
 		error = ENXIO;
 	else {
 		/*
 		 * Flush all writes, so that the server is up to date.
 		 * Although a Commit is not required, the commit argument
 		 * is set so that, for a pNFS File/Flexible File Layout
 		 * server, the LayoutCommit will be done to ensure the file
 		 * size is up to date on the Metadata Server.
 		 */
 		error = ncl_flush(vp, MNT_WAIT, ap->a_td, 1, 0);
 		if (error == 0)
 			error = nfsrpc_seek(vp, (off_t *)ap->a_data, &eof,
 			    content, ap->a_cred, &nfsva, &attrflag);
 		/* If at eof for FIOSEEKDATA, return ENXIO. */
 		if (eof && error == 0 && content == NFSV4CONTENT_DATA)
 			error = ENXIO;
 	}
 	if (attrflag != 0) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (error == 0 && ret != 0)
 			error = ret;
 	}
 	NFSVOPUNLOCK(vp, 0);
 
 	if (error != 0)
 		error = ENXIO;
 	return (error);
 }
 
 /*
  * nfs getextattr call
  */
 static int
 nfs_getextattr(struct vop_getextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsmount *nmp;
 	struct ucred *cred;
 	struct thread *td = ap->a_td;
 	struct nfsvattr nfsva;
 	ssize_t len;
 	int attrflag, error, ret;
 
 	nmp = VFSTONFS(vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 ||
 	    ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EOPNOTSUPP);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	cred = ap->a_cred;
 	if (cred == NULL)
 		cred = td->td_ucred;
 	/* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */
 	attrflag = 0;
 	error = nfsrpc_getextattr(vp, ap->a_name, ap->a_uio, &len, &nfsva,
 	    &attrflag, cred, td);
 	if (attrflag != 0) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (error == 0 && ret != 0)
 			error = ret;
 	}
 	if (error == 0 && ap->a_size != NULL)
 		*ap->a_size = len;
 
 	switch (error) {
 	case NFSERR_NOTSUPP:
 	case NFSERR_OPILLEGAL:
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_NOXATTR;
 		mtx_unlock(&nmp->nm_mtx);
 		error = EOPNOTSUPP;
 		break;
 	case NFSERR_NOXATTR:
 	case NFSERR_XATTR2BIG:
 		error = ENOATTR;
 		break;
 	default:
 		error = nfscl_maperr(td, error, 0, 0);
 		break;
 	}
 	return (error);
 }
 
 /*
  * nfs setextattr call
  */
 static int
 nfs_setextattr(struct vop_setextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsmount *nmp;
 	struct ucred *cred;
 	struct thread *td = ap->a_td;
 	struct nfsvattr nfsva;
 	int attrflag, error, ret;
 
 	nmp = VFSTONFS(vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 ||
 	    ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EOPNOTSUPP);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	if (ap->a_uio->uio_resid <= 0)
 		return (EINVAL);
 	cred = ap->a_cred;
 	if (cred == NULL)
 		cred = td->td_ucred;
 	/* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */
 	attrflag = 0;
 	error = nfsrpc_setextattr(vp, ap->a_name, ap->a_uio, &nfsva,
 	    &attrflag, cred, td);
 	if (attrflag != 0) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (error == 0 && ret != 0)
 			error = ret;
 	}
 
 	switch (error) {
 	case NFSERR_NOTSUPP:
 	case NFSERR_OPILLEGAL:
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_NOXATTR;
 		mtx_unlock(&nmp->nm_mtx);
 		error = EOPNOTSUPP;
 		break;
 	case NFSERR_NOXATTR:
 	case NFSERR_XATTR2BIG:
 		error = ENOATTR;
 		break;
 	default:
 		error = nfscl_maperr(td, error, 0, 0);
 		break;
 	}
 	return (error);
 }
 
 /*
  * nfs listextattr call
  */
 static int
 nfs_listextattr(struct vop_listextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsmount *nmp;
 	struct ucred *cred;
 	struct thread *td = ap->a_td;
 	struct nfsvattr nfsva;
 	size_t len, len2;
 	uint64_t cookie;
 	int attrflag, error, ret;
 	bool eof;
 
 	nmp = VFSTONFS(vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 ||
 	    ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EOPNOTSUPP);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	cred = ap->a_cred;
 	if (cred == NULL)
 		cred = td->td_ucred;
 
 	/* Loop around doing List Extended Attribute RPCs. */
 	eof = false;
 	cookie = 0;
 	len2 = 0;
 	error = 0;
 	while (!eof && error == 0) {
 		len = nmp->nm_rsize;
 		attrflag = 0;
 		error = nfsrpc_listextattr(vp, &cookie, ap->a_uio, &len, &eof,
 		    &nfsva, &attrflag, cred, td);
 		if (attrflag != 0) {
 			ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 			if (error == 0 && ret != 0)
 				error = ret;
 		}
 		if (error == 0) {
 			len2 += len;
 			if (len2 > SSIZE_MAX)
 				error = ENOATTR;
 		}
 	}
 	if (error == 0 && ap->a_size != NULL)
 		*ap->a_size = len2;
 
 	switch (error) {
 	case NFSERR_NOTSUPP:
 	case NFSERR_OPILLEGAL:
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_NOXATTR;
 		mtx_unlock(&nmp->nm_mtx);
 		error = EOPNOTSUPP;
 		break;
 	case NFSERR_NOXATTR:
 	case NFSERR_XATTR2BIG:
 		error = ENOATTR;
 		break;
 	default:
 		error = nfscl_maperr(td, error, 0, 0);
 		break;
 	}
 	return (error);
 }
 
 /*
  * nfs setextattr call
  */
 static int
 nfs_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct nfsmount *nmp;
 	struct nfsvattr nfsva;
 	int attrflag, error, ret;
 
 	nmp = VFSTONFS(vp->v_mount);
 	mtx_lock(&nmp->nm_mtx);
 	if (!NFSHASNFSV4(nmp) || nmp->nm_minorvers < NFSV42_MINORVERSION ||
 	    (nmp->nm_privflag & NFSMNTP_NOXATTR) != 0 ||
 	    ap->a_attrnamespace != EXTATTR_NAMESPACE_USER) {
 		mtx_unlock(&nmp->nm_mtx);
 		return (EOPNOTSUPP);
 	}
 	mtx_unlock(&nmp->nm_mtx);
 
 	/* Do the actual NFSv4.2 Optional Extended Attribute (RFC-8276) RPC. */
 	attrflag = 0;
 	error = nfsrpc_rmextattr(vp, ap->a_name, &nfsva, &attrflag, ap->a_cred,
 	    ap->a_td);
 	if (attrflag != 0) {
 		ret = nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1);
 		if (error == 0 && ret != 0)
 			error = ret;
 	}
 
 	switch (error) {
 	case NFSERR_NOTSUPP:
 	case NFSERR_OPILLEGAL:
 		mtx_lock(&nmp->nm_mtx);
 		nmp->nm_privflag |= NFSMNTP_NOXATTR;
 		mtx_unlock(&nmp->nm_mtx);
 		error = EOPNOTSUPP;
 		break;
 	case NFSERR_NOXATTR:
 	case NFSERR_XATTR2BIG:
 		error = ENOATTR;
 		break;
 	default:
 		error = nfscl_maperr(ap->a_td, error, 0, 0);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to nfs filesystems.
  */
 static int
 nfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct nfsv3_pathconf pc;
 	struct nfsvattr nfsva;
 	struct vnode *vp = ap->a_vp;
 	struct nfsmount *nmp;
 	struct thread *td = curthread;
 	off_t off;
 	bool eof;
 	int attrflag, error;
 
 	if ((NFS_ISV34(vp) && (ap->a_name == _PC_LINK_MAX ||
 	    ap->a_name == _PC_NAME_MAX || ap->a_name == _PC_CHOWN_RESTRICTED ||
 	    ap->a_name == _PC_NO_TRUNC)) ||
 	    (NFS_ISV4(vp) && ap->a_name == _PC_ACL_NFS4)) {
 		/*
 		 * Since only the above 4 a_names are returned by the NFSv3
 		 * Pathconf RPC, there is no point in doing it for others.
 		 * For NFSv4, the Pathconf RPC (actually a Getattr Op.) can
 		 * be used for _PC_NFS4_ACL as well.
 		 */
 		error = nfsrpc_pathconf(vp, &pc, td->td_ucred, td, &nfsva,
 		    &attrflag, NULL);
 		if (attrflag != 0)
 			(void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0,
 			    1);
 		if (error != 0)
 			return (error);
 	} else {
 		/*
 		 * For NFSv2 (or NFSv3 when not one of the above 4 a_names),
 		 * just fake them.
 		 */
 		pc.pc_linkmax = NFS_LINK_MAX;
 		pc.pc_namemax = NFS_MAXNAMLEN;
 		pc.pc_notrunc = 1;
 		pc.pc_chownrestricted = 1;
 		pc.pc_caseinsensitive = 0;
 		pc.pc_casepreserving = 1;
 		error = 0;
 	}
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 #ifdef _LP64
 		*ap->a_retval = pc.pc_linkmax;
 #else
 		*ap->a_retval = MIN(LONG_MAX, pc.pc_linkmax);
 #endif
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = pc.pc_namemax;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = pc.pc_chownrestricted;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = pc.pc_notrunc;
 		break;
 	case _PC_ACL_NFS4:
 		if (NFS_ISV4(vp) && nfsrv_useacl != 0 && attrflag != 0 &&
 		    NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL))
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_PATH_MAX:
 		if (NFS_ISV4(vp))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		if (NFS_ISV34(vp))
 			*ap->a_retval = 64;
 		else
 			*ap->a_retval = 32;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = NFS_MAXPATHLEN;
 		break;
 	case _PC_MIN_HOLE_SIZE:
 		/* Only some NFSv4.2 servers support Seek for Holes. */
 		*ap->a_retval = 0;
 		nmp = VFSTONFS(vp->v_mount);
 		if (NFS_ISV4(vp) && nmp->nm_minorvers == NFSV42_MINORVERSION) {
 			/*
 			 * NFSv4.2 doesn't have an attribute for hole size,
 			 * so all we can do is see if the Seek operation is
 			 * supported and then use f_iosize as a "best guess".
 			 */
 			mtx_lock(&nmp->nm_mtx);
 			if ((nmp->nm_privflag & NFSMNTP_SEEKTESTED) == 0) {
 				mtx_unlock(&nmp->nm_mtx);
 				off = 0;
 				attrflag = 0;
 				error = nfsrpc_seek(vp, &off, &eof,
 				    NFSV4CONTENT_HOLE, td->td_ucred, &nfsva,
 				    &attrflag);
 				if (attrflag != 0)
 					nfscl_loadattrcache(&vp, &nfsva,
 					    NULL, NULL, 0, 1);
 				mtx_lock(&nmp->nm_mtx);
 				if (error == NFSERR_NOTSUPP)
 					nmp->nm_privflag |= NFSMNTP_SEEKTESTED;
 				else
 					nmp->nm_privflag |= NFSMNTP_SEEKTESTED |
 					    NFSMNTP_SEEK;
 				error = 0;
 			}
 			if ((nmp->nm_privflag & NFSMNTP_SEEK) != 0)
 				*ap->a_retval = vp->v_mount->mnt_stat.f_iosize;
 			mtx_unlock(&nmp->nm_mtx);
 		}
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
Index: head/sys/fs/nullfs/null_vnops.c
===================================================================
--- head/sys/fs/nullfs/null_vnops.c	(revision 355789)
+++ head/sys/fs/nullfs/null_vnops.c	(revision 355790)
@@ -1,945 +1,946 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * John Heidemann of the UCLA Ficus project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)null_vnops.c	8.6 (Berkeley) 5/27/95
  *
  * Ancestors:
  *	@(#)lofs_vnops.c	1.2 (Berkeley) 6/18/92
  *	...and...
  *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
  *
  * $FreeBSD$
  */
 
 /*
  * Null Layer
  *
  * (See mount_nullfs(8) for more information.)
  *
  * The null layer duplicates a portion of the filesystem
  * name space under a new name.  In this respect, it is
  * similar to the loopback filesystem.  It differs from
  * the loopback fs in two respects:  it is implemented using
  * a stackable layers techniques, and its "null-node"s stack above
  * all lower-layer vnodes, not just over directory vnodes.
  *
  * The null layer has two purposes.  First, it serves as a demonstration
  * of layering by proving a layer which does nothing.  (It actually
  * does everything the loopback filesystem does, which is slightly
  * more than nothing.)  Second, the null layer can serve as a prototype
  * layer.  Since it provides all necessary layer framework,
  * new filesystem layers can be created very easily be starting
  * with a null layer.
  *
  * The remainder of this man page examines the null layer as a basis
  * for constructing new layers.
  *
  *
  * INSTANTIATING NEW NULL LAYERS
  *
  * New null layers are created with mount_nullfs(8).
  * Mount_nullfs(8) takes two arguments, the pathname
  * of the lower vfs (target-pn) and the pathname where the null
  * layer will appear in the namespace (alias-pn).  After
  * the null layer is put into place, the contents
  * of target-pn subtree will be aliased under alias-pn.
  *
  *
  * OPERATION OF A NULL LAYER
  *
  * The null layer is the minimum filesystem layer,
  * simply bypassing all possible operations to the lower layer
  * for processing there.  The majority of its activity centers
  * on the bypass routine, through which nearly all vnode operations
  * pass.
  *
  * The bypass routine accepts arbitrary vnode operations for
  * handling by the lower layer.  It begins by examing vnode
  * operation arguments and replacing any null-nodes by their
  * lower-layer equivlants.  It then invokes the operation
  * on the lower layer.  Finally, it replaces the null-nodes
  * in the arguments and, if a vnode is return by the operation,
  * stacks a null-node on top of the returned vnode.
  *
  * Although bypass handles most operations, vop_getattr, vop_lock,
  * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not
  * bypassed. Vop_getattr must change the fsid being returned.
  * Vop_lock and vop_unlock must handle any locking for the
  * current vnode as well as pass the lock request down.
  * Vop_inactive and vop_reclaim are not bypassed so that
  * they can handle freeing null-layer specific data. Vop_print
  * is not bypassed to avoid excessive debugging information.
  * Also, certain vnode operations change the locking state within
  * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
  * and symlink). Ideally these operations should not change the
  * lock state, but should be changed to let the caller of the
  * function unlock them. Otherwise all intermediate vnode layers
  * (such as union, umapfs, etc) must catch these functions to do
  * the necessary locking at their layer.
  *
  *
  * INSTANTIATING VNODE STACKS
  *
  * Mounting associates the null layer with a lower layer,
  * effect stacking two VFSes.  Vnode stacks are instead
  * created on demand as files are accessed.
  *
  * The initial mount creates a single vnode stack for the
  * root of the new null layer.  All other vnode stacks
  * are created as a result of vnode operations on
  * this or other null vnode stacks.
  *
  * New vnode stacks come into existence as a result of
  * an operation which returns a vnode.
  * The bypass routine stacks a null-node above the new
  * vnode before returning it to the caller.
  *
  * For example, imagine mounting a null layer with
  * "mount_nullfs /usr/include /dev/layer/null".
  * Changing directory to /dev/layer/null will assign
  * the root null-node (which was created when the null layer was mounted).
  * Now consider opening "sys".  A vop_lookup would be
  * done on the root null-node.  This operation would bypass through
  * to the lower layer which would return a vnode representing
  * the UFS "sys".  Null_bypass then builds a null-node
  * aliasing the UFS "sys" and returns this to the caller.
  * Later operations on the null-node "sys" will repeat this
  * process when constructing other vnode stacks.
  *
  *
  * CREATING OTHER FILE SYSTEM LAYERS
  *
  * One of the easiest ways to construct new filesystem layers is to make
  * a copy of the null layer, rename all files and variables, and
  * then begin modifing the copy.  Sed can be used to easily rename
  * all variables.
  *
  * The umap layer is an example of a layer descended from the
  * null layer.
  *
  *
  * INVOKING OPERATIONS ON LOWER LAYERS
  *
  * There are two techniques to invoke operations on a lower layer
  * when the operation cannot be completely bypassed.  Each method
  * is appropriate in different situations.  In both cases,
  * it is the responsibility of the aliasing layer to make
  * the operation arguments "correct" for the lower layer
  * by mapping a vnode arguments to the lower layer.
  *
  * The first approach is to call the aliasing layer's bypass routine.
  * This method is most suitable when you wish to invoke the operation
  * currently being handled on the lower layer.  It has the advantage
  * that the bypass routine already must do argument mapping.
  * An example of this is null_getattrs in the null layer.
  *
  * A second approach is to directly invoke vnode operations on
  * the lower layer with the VOP_OPERATIONNAME interface.
  * The advantage of this method is that it is easy to invoke
  * arbitrary operations on the lower layer.  The disadvantage
  * is that vnode arguments must be manualy mapped.
  *
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/nullfs/null.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vnode_pager.h>
 
 static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
 SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, 
 	&null_bug_bypass, 0, "");
 
 /*
  * This is the 10-Apr-92 bypass routine.
  *    This version has been optimized for speed, throwing away some
  * safety checks.  It should still always work, but it's not as
  * robust to programmer errors.
  *
  * In general, we map all vnodes going down and unmap them on the way back.
  * As an exception to this, vnodes can be marked "unmapped" by setting
  * the Nth bit in operation's vdesc_flags.
  *
  * Also, some BSD vnode operations have the side effect of vrele'ing
  * their arguments.  With stacking, the reference counts are held
  * by the upper node, not the lower one, so we must handle these
  * side-effects here.  This is not of concern in Sun-derived systems
  * since there are no such side-effects.
  *
  * This makes the following assumptions:
  * - only one returned vpp
  * - no INOUT vpp's (Sun's vop_open has one of these)
  * - the vnode operation vector of the first vnode should be used
  *   to determine what implementation of the op should be invoked
  * - all mapped vnodes are of our vnode-type (NEEDSWORK:
  *   problems on rmdir'ing mount points and renaming?)
  */
 int
 null_bypass(struct vop_generic_args *ap)
 {
 	struct vnode **this_vp_p;
 	int error;
 	struct vnode *old_vps[VDESC_MAX_VPS];
 	struct vnode **vps_p[VDESC_MAX_VPS];
 	struct vnode ***vppp;
 	struct vnodeop_desc *descp = ap->a_desc;
 	int reles, i;
 
 	if (null_bug_bypass)
 		printf ("null_bypass: %s\n", descp->vdesc_name);
 
 #ifdef DIAGNOSTIC
 	/*
 	 * We require at least one vp.
 	 */
 	if (descp->vdesc_vp_offsets == NULL ||
 	    descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
 		panic ("null_bypass: no vp's in map");
 #endif
 
 	/*
 	 * Map the vnodes going in.
 	 * Later, we'll invoke the operation based on
 	 * the first mapped vnode's operation vector.
 	 */
 	reles = descp->vdesc_flags;
 	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
 		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
 			break;   /* bail out at end of list */
 		vps_p[i] = this_vp_p =
 			VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap);
 		/*
 		 * We're not guaranteed that any but the first vnode
 		 * are of our type.  Check for and don't map any
 		 * that aren't.  (We must always map first vp or vclean fails.)
 		 */
 		if (i && (*this_vp_p == NULLVP ||
 		    (*this_vp_p)->v_op != &null_vnodeops)) {
 			old_vps[i] = NULLVP;
 		} else {
 			old_vps[i] = *this_vp_p;
 			*(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p);
 			/*
 			 * XXX - Several operations have the side effect
 			 * of vrele'ing their vp's.  We must account for
 			 * that.  (This should go away in the future.)
 			 */
 			if (reles & VDESC_VP0_WILLRELE)
 				VREF(*this_vp_p);
 		}
 
 	}
 
 	/*
 	 * Call the operation on the lower layer
 	 * with the modified argument structure.
 	 */
 	if (vps_p[0] && *vps_p[0])
 		error = VCALL(ap);
 	else {
 		printf("null_bypass: no map for %s\n", descp->vdesc_name);
 		error = EINVAL;
 	}
 
 	/*
 	 * Maintain the illusion of call-by-value
 	 * by restoring vnodes in the argument structure
 	 * to their original value.
 	 */
 	reles = descp->vdesc_flags;
 	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
 		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
 			break;   /* bail out at end of list */
 		if (old_vps[i]) {
 			*(vps_p[i]) = old_vps[i];
 #if 0
 			if (reles & VDESC_VP0_WILLUNLOCK)
 				VOP_UNLOCK(*(vps_p[i]), 0);
 #endif
 			if (reles & VDESC_VP0_WILLRELE)
 				vrele(*(vps_p[i]));
 		}
 	}
 
 	/*
 	 * Map the possible out-going vpp
 	 * (Assumes that the lower layer always returns
 	 * a VREF'ed vpp unless it gets an error.)
 	 */
 	if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET &&
 	    !(descp->vdesc_flags & VDESC_NOMAP_VPP) &&
 	    !error) {
 		/*
 		 * XXX - even though some ops have vpp returned vp's,
 		 * several ops actually vrele this before returning.
 		 * We must avoid these ops.
 		 * (This should go away when these ops are regularized.)
 		 */
 		if (descp->vdesc_flags & VDESC_VPP_WILLRELE)
 			goto out;
 		vppp = VOPARG_OFFSETTO(struct vnode***,
 				 descp->vdesc_vpp_offset,ap);
 		if (*vppp)
 			error = null_nodeget(old_vps[0]->v_mount, **vppp, *vppp);
 	}
 
  out:
 	return (error);
 }
 
 static int
 null_add_writecount(struct vop_add_writecount_args *ap)
 {
 	struct vnode *lvp, *vp;
 	int error;
 
 	vp = ap->a_vp;
 	lvp = NULLVPTOLOWERVP(vp);
 	VI_LOCK(vp);
 	/* text refs are bypassed to lowervp */
 	VNASSERT(vp->v_writecount >= 0, vp, ("wrong null writecount"));
 	VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 	    ("wrong writecount inc %d", ap->a_inc));
 	error = VOP_ADD_WRITECOUNT(lvp, ap->a_inc);
 	if (error == 0)
 		vp->v_writecount += ap->a_inc;
 	VI_UNLOCK(vp);
 	return (error);
 }
 
 /*
  * We have to carry on the locking protocol on the null layer vnodes
  * as we progress through the tree. We also have to enforce read-only
  * if this layer is mounted read-only.
  */
 static int
 null_lookup(struct vop_lookup_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *dvp = ap->a_dvp;
 	int flags = cnp->cn_flags;
 	struct vnode *vp, *ldvp, *lvp;
 	struct mount *mp;
 	int error;
 
 	mp = dvp->v_mount;
 	if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 	/*
 	 * Although it is possible to call null_bypass(), we'll do
 	 * a direct call to reduce overhead
 	 */
 	ldvp = NULLVPTOLOWERVP(dvp);
 	vp = lvp = NULL;
 	KASSERT((ldvp->v_vflag & VV_ROOT) == 0 ||
 	    ((dvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) == 0),
 	    ("ldvp %p fl %#x dvp %p fl %#x flags %#x", ldvp, ldvp->v_vflag,
 	     dvp, dvp->v_vflag, flags));
 
 	/*
 	 * Hold ldvp.  The reference on it, owned by dvp, is lost in
 	 * case of dvp reclamation, and we need ldvp to move our lock
 	 * from ldvp to dvp.
 	 */
 	vhold(ldvp);
 
 	error = VOP_LOOKUP(ldvp, &lvp, cnp);
 
 	/*
 	 * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows
 	 * dvp to be reclaimed due to shared v_vnlock.  Check for the
 	 * doomed state and return error.
 	 */
 	if ((error == 0 || error == EJUSTRETURN) &&
 	    VN_IS_DOOMED(dvp)) {
 		error = ENOENT;
 		if (lvp != NULL)
 			vput(lvp);
 
 		/*
 		 * If vgone() did reclaimed dvp before curthread
 		 * relocked ldvp, the locks of dvp and ldpv are no
 		 * longer shared.  In this case, relock of ldvp in
 		 * lower fs VOP_LOOKUP() does not restore the locking
 		 * state of dvp.  Compensate for this by unlocking
 		 * ldvp and locking dvp, which is also correct if the
 		 * locks are still shared.
 		 */
 		VOP_UNLOCK(ldvp, 0);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 	}
 	vdrop(ldvp);
 
 	if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 &&
 	    (mp->mnt_flag & MNT_RDONLY) != 0 &&
 	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
 		error = EROFS;
 
 	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
 		if (ldvp == lvp) {
 			*ap->a_vpp = dvp;
 			VREF(dvp);
 			vrele(lvp);
 		} else {
 			error = null_nodeget(mp, lvp, &vp);
 			if (error == 0)
 				*ap->a_vpp = vp;
 		}
 	}
 	return (error);
 }
 
 static int
 null_open(struct vop_open_args *ap)
 {
 	int retval;
 	struct vnode *vp, *ldvp;
 
 	vp = ap->a_vp;
 	ldvp = NULLVPTOLOWERVP(vp);
 	retval = null_bypass(&ap->a_gen);
 	if (retval == 0)
 		vp->v_object = ldvp->v_object;
 	return (retval);
 }
 
 /*
  * Setattr call. Disallow write attempts if the layer is mounted read-only.
  */
 static int
 null_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 
   	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
 	    (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		case VDIR:
  			return (EISDIR);
  		case VCHR:
  		case VBLK:
  		case VSOCK:
  		case VFIFO:
 			if (vap->va_flags != VNOVAL)
 				return (EOPNOTSUPP);
 			return (0);
 		case VREG:
 		case VLNK:
  		default:
 			/*
 			 * Disallow write attempts if the filesystem is
 			 * mounted read-only.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 		}
 	}
 
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
 /*
  *  We handle getattr only to change the fsid.
  */
 static int
 null_getattr(struct vop_getattr_args *ap)
 {
 	int error;
 
 	if ((error = null_bypass((struct vop_generic_args *)ap)) != 0)
 		return (error);
 
 	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 	return (0);
 }
 
 /*
  * Handle to disallow write access if mounted read-only.
  */
 static int
 null_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 
 	/*
 	 * Disallow write attempts on read-only layers;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
 static int
 null_accessx(struct vop_accessx_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 
 	/*
 	 * Disallow write attempts on read-only layers;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 	}
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
 /*
  * Increasing refcount of lower vnode is needed at least for the case
  * when lower FS is NFS to do sillyrename if the file is in use.
  * Unfortunately v_usecount is incremented in many places in
  * the kernel and, as such, there may be races that result in
  * the NFS client doing an extraneous silly rename, but that seems
  * preferable to not doing a silly rename when it is needed.
  */
 static int
 null_remove(struct vop_remove_args *ap)
 {
 	int retval, vreleit;
 	struct vnode *lvp, *vp;
 
 	vp = ap->a_vp;
 	if (vrefcnt(vp) > 1) {
 		lvp = NULLVPTOLOWERVP(vp);
 		VREF(lvp);
 		vreleit = 1;
 	} else
 		vreleit = 0;
 	VTONULL(vp)->null_flags |= NULLV_DROP;
 	retval = null_bypass(&ap->a_gen);
 	if (vreleit != 0)
 		vrele(lvp);
 	return (retval);
 }
 
 /*
  * We handle this to eliminate null FS to lower FS
  * file moving. Don't know why we don't allow this,
  * possibly we should.
  */
 static int
 null_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct null_node *tnn;
 
 	/* Check for cross-device rename. */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (EXDEV);
 	}
 
 	if (tvp != NULL) {
 		tnn = VTONULL(tvp);
 		tnn->null_flags |= NULLV_DROP;
 	}
 	return (null_bypass((struct vop_generic_args *)ap));
 }
 
 static int
 null_rmdir(struct vop_rmdir_args *ap)
 {
 
 	VTONULL(ap->a_vp)->null_flags |= NULLV_DROP;
 	return (null_bypass(&ap->a_gen));
 }
 
 /*
  * We need to process our own vnode lock and then clear the
  * interlock flag as it applies only to our vnode, not the
  * vnodes below us on the stack.
  */
 static int
 null_lock(struct vop_lock1_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int flags;
 	struct null_node *nn;
 	struct vnode *lvp;
 	int error;
 
 	if ((ap->a_flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	else
 		ap->a_flags &= ~LK_INTERLOCK;
 	flags = ap->a_flags;
 	nn = VTONULL(vp);
 	/*
 	 * If we're still active we must ask the lower layer to
 	 * lock as ffs has special lock considerations in its
 	 * vop lock.
 	 */
 	if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
 		/*
 		 * We have to hold the vnode here to solve a potential
 		 * reclaim race.  If we're forcibly vgone'd while we
 		 * still have refs, a thread could be sleeping inside
 		 * the lowervp's vop_lock routine.  When we vgone we will
 		 * drop our last ref to the lowervp, which would allow it
 		 * to be reclaimed.  The lowervp could then be recycled,
 		 * in which case it is not legal to be sleeping in its VOP.
 		 * We prevent it from being recycled by holding the vnode
 		 * here.
 		 */
 		vholdnz(lvp);
 		VI_UNLOCK(vp);
 		error = VOP_LOCK(lvp, flags);
 
 		/*
 		 * We might have slept to get the lock and someone might have
 		 * clean our vnode already, switching vnode lock from one in
 		 * lowervp to v_lock in our own vnode structure.  Handle this
 		 * case by reacquiring correct lock in requested mode.
 		 */
 		if (VTONULL(vp) == NULL && error == 0) {
 			ap->a_flags &= ~LK_TYPE_MASK;
 			switch (flags & LK_TYPE_MASK) {
 			case LK_SHARED:
 				ap->a_flags |= LK_SHARED;
 				break;
 			case LK_UPGRADE:
 			case LK_EXCLUSIVE:
 				ap->a_flags |= LK_EXCLUSIVE;
 				break;
 			default:
 				panic("Unsupported lock request %d\n",
 				    ap->a_flags);
 			}
 			VOP_UNLOCK(lvp, 0);
 			error = vop_stdlock(ap);
 		}
 		vdrop(lvp);
 	} else {
 		VI_UNLOCK(vp);
 		error = vop_stdlock(ap);
 	}
 
 	return (error);
 }
 
 /*
  * We need to process our own vnode unlock and then clear the
  * interlock flag as it applies only to our vnode, not the
  * vnodes below us on the stack.
  */
 static int
 null_unlock(struct vop_unlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct null_node *nn;
 	struct vnode *lvp;
 	int error;
 
 	nn = VTONULL(vp);
 	if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) {
 		vholdnz(lvp);
 		error = VOP_UNLOCK(lvp, 0);
 		vdrop(lvp);
 	} else {
 		error = vop_stdunlock(ap);
 	}
 
 	return (error);
 }
 
 /*
  * Do not allow the VOP_INACTIVE to be passed to the lower layer,
  * since the reference count on the lower vnode is not related to
  * ours.
  */
 static int
 null_want_recycle(struct vnode *vp)
 {
 	struct vnode *lvp;
 	struct null_node *xp;
 	struct mount *mp;
 	struct null_mount *xmp;
 
 	xp = VTONULL(vp);
 	lvp = NULLVPTOLOWERVP(vp);
 	mp = vp->v_mount;
 	xmp = MOUNTTONULLMOUNT(mp);
 	if ((xmp->nullm_flags & NULLM_CACHE) == 0 ||
 	    (xp->null_flags & NULLV_DROP) != 0 ||
 	    (lvp->v_vflag & VV_NOSYNC) != 0) {
 		/*
 		 * If this is the last reference and caching of the
 		 * nullfs vnodes is not enabled, or the lower vnode is
 		 * deleted, then free up the vnode so as not to tie up
 		 * the lower vnodes.
 		 */
 		return (1);
 	}
 	return (0);
 }
 
 static int
 null_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	if (null_want_recycle(vp)) {
 		vp->v_object = NULL;
 		vrecycle(vp);
 	}
 	return (0);
 }
 
 static int
 null_need_inactive(struct vop_need_inactive_args *ap)
 {
 
 	return (null_want_recycle(ap->a_vp));
 }
 
 /*
  * Now, the nullfs vnode and, due to the sharing lock, the lower
  * vnode, are exclusively locked, and we shall destroy the null vnode.
  */
 static int
 null_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct null_node *xp;
 	struct vnode *lowervp;
 
 	vp = ap->a_vp;
 	xp = VTONULL(vp);
 	lowervp = xp->null_lowervp;
 
 	KASSERT(lowervp != NULL && vp->v_vnlock != &vp->v_lock,
 	    ("Reclaiming incomplete null vnode %p", vp));
 
 	null_hashrem(xp);
 	/*
 	 * Use the interlock to protect the clearing of v_data to
 	 * prevent faults in null_lock().
 	 */
 	lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
 	VI_LOCK(vp);
 	vp->v_data = NULL;
 	vp->v_object = NULL;
 	vp->v_vnlock = &vp->v_lock;
 
 	/*
 	 * If we were opened for write, we leased the write reference
 	 * to the lower vnode.  If this is a reclamation due to the
 	 * forced unmount, undo the reference now.
 	 */
 	if (vp->v_writecount > 0)
 		VOP_ADD_WRITECOUNT(lowervp, -vp->v_writecount);
 	else if (vp->v_writecount < 0)
 		vp->v_writecount = 0;
 
 	VI_UNLOCK(vp);
 
 	if ((xp->null_flags & NULLV_NOUNLOCK) != 0)
 		vunref(lowervp);
 	else
 		vput(lowervp);
 	free(xp, M_NULLFSNODE);
 
 	return (0);
 }
 
 static int
 null_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	printf("\tvp=%p, lowervp=%p\n", vp, VTONULL(vp)->null_lowervp);
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 null_getwritemount(struct vop_getwritemount_args *ap)
 {
 	struct null_node *xp;
 	struct vnode *lowervp;
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	xp = VTONULL(vp);
 	if (xp && (lowervp = xp->null_lowervp)) {
 		vholdnz(lowervp);
 		VI_UNLOCK(vp);
 		VOP_GETWRITEMOUNT(lowervp, ap->a_mpp);
 		vdrop(lowervp);
 	} else {
 		VI_UNLOCK(vp);
 		*(ap->a_mpp) = NULL;
 	}
 	return (0);
 }
 
 static int
 null_vptofh(struct vop_vptofh_args *ap)
 {
 	struct vnode *lvp;
 
 	lvp = NULLVPTOLOWERVP(ap->a_vp);
 	return VOP_VPTOFH(lvp, ap->a_fhp);
 }
 
 static int
 null_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct vnode *lvp, *ldvp;
 	struct ucred *cred = ap->a_cred;
 	struct mount *mp;
 	int error, locked;
 
 	locked = VOP_ISLOCKED(vp);
 	lvp = NULLVPTOLOWERVP(vp);
 	vhold(lvp);
 	mp = vp->v_mount;
 	vfs_ref(mp);
 	VOP_UNLOCK(vp, 0); /* vp is held by vn_vptocnp_locked that called us */
 	ldvp = lvp;
 	vref(lvp);
 	error = vn_vptocnp(&ldvp, cred, ap->a_buf, ap->a_buflen);
 	vdrop(lvp);
 	if (error != 0) {
 		vn_lock(vp, locked | LK_RETRY);
 		vfs_rel(mp);
 		return (ENOENT);
 	}
 
 	error = vn_lock(ldvp, LK_SHARED);
 	if (error != 0) {
 		vrele(ldvp);
 		vn_lock(vp, locked | LK_RETRY);
 		vfs_rel(mp);
 		return (ENOENT);
 	}
 	error = null_nodeget(mp, ldvp, dvp);
 	if (error == 0) {
 #ifdef DIAGNOSTIC
 		NULLVPTOLOWERVP(*dvp);
 #endif
 		VOP_UNLOCK(*dvp, 0); /* keep reference on *dvp */
 	}
 	vn_lock(vp, locked | LK_RETRY);
 	vfs_rel(mp);
 	return (error);
 }
 
 /*
  * Global vfs data structures
  */
 struct vop_vector null_vnodeops = {
 	.vop_bypass =		null_bypass,
 	.vop_access =		null_access,
 	.vop_accessx =		null_accessx,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
 	.vop_bmap =		VOP_EOPNOTSUPP,
 	.vop_getattr =		null_getattr,
 	.vop_getwritemount =	null_getwritemount,
 	.vop_inactive =		null_inactive,
 	.vop_need_inactive =	null_need_inactive,
 	.vop_islocked =		vop_stdislocked,
 	.vop_lock1 =		null_lock,
 	.vop_lookup =		null_lookup,
 	.vop_open =		null_open,
 	.vop_print =		null_print,
 	.vop_reclaim =		null_reclaim,
 	.vop_remove =		null_remove,
 	.vop_rename =		null_rename,
 	.vop_rmdir =		null_rmdir,
 	.vop_setattr =		null_setattr,
 	.vop_strategy =		VOP_EOPNOTSUPP,
 	.vop_unlock =		null_unlock,
 	.vop_vptocnp =		null_vptocnp,
 	.vop_vptofh =		null_vptofh,
 	.vop_add_writecount =	null_add_writecount,
 };
+VFS_VOP_VECTOR_REGISTER(null_vnodeops);
Index: head/sys/fs/pseudofs/pseudofs_vnops.c
===================================================================
--- head/sys/fs/pseudofs/pseudofs_vnops.c	(revision 355789)
+++ head/sys/fs/pseudofs/pseudofs_vnops.c	(revision 355790)
@@ -1,1097 +1,1098 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_pseudofs.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/ctype.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <fs/pseudofs/pseudofs.h>
 #include <fs/pseudofs/pseudofs_internal.h>
 
 #define KASSERT_PN_IS_DIR(pn)						\
 	KASSERT((pn)->pn_type == pfstype_root ||			\
 	    (pn)->pn_type == pfstype_dir ||				\
 	    (pn)->pn_type == pfstype_procdir,				\
 	    ("%s(): VDIR vnode refers to non-directory pfs_node", __func__))
 
 #define KASSERT_PN_IS_FILE(pn)						\
 	KASSERT((pn)->pn_type == pfstype_file,				\
 	    ("%s(): VREG vnode refers to non-file pfs_node", __func__))
 
 #define KASSERT_PN_IS_LINK(pn)						\
 	KASSERT((pn)->pn_type == pfstype_symlink,			\
 	    ("%s(): VLNK vnode refers to non-link pfs_node", __func__))
 
 /*
  * Returns the fileno, adjusted for target pid
  */
 static uint32_t
 pn_fileno(struct pfs_node *pn, pid_t pid)
 {
 
 	KASSERT(pn->pn_fileno > 0,
 	    ("%s(): no fileno allocated", __func__));
 	if (pid != NO_PID)
 		return (pn->pn_fileno * NO_PID + pid);
 	return (pn->pn_fileno);
 }
 
 /*
  * Returns non-zero if given file is visible to given thread.
  */
 static int
 pfs_visible_proc(struct thread *td, struct pfs_node *pn, struct proc *proc)
 {
 	int visible;
 
 	if (proc == NULL)
 		return (0);
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 
 	visible = ((proc->p_flag & P_WEXIT) == 0);
 	if (visible)
 		visible = (p_cansee(td, proc) == 0);
 	if (visible && pn->pn_vis != NULL)
 		visible = pn_vis(td, proc, pn);
 	if (!visible)
 		return (0);
 	return (1);
 }
 
 static int
 pfs_visible(struct thread *td, struct pfs_node *pn, pid_t pid,
     struct proc **p)
 {
 	struct proc *proc;
 
 	PFS_TRACE(("%s (pid: %d, req: %d)",
 	    pn->pn_name, pid, td->td_proc->p_pid));
 
 	if (p)
 		*p = NULL;
 	if (pid == NO_PID)
 		PFS_RETURN (1);
 	proc = pfind(pid);
 	if (proc == NULL)
 		PFS_RETURN (0);
 	if (pfs_visible_proc(td, pn, proc)) {
 		if (p)
 			*p = proc;
 		else
 			PROC_UNLOCK(proc);
 		PFS_RETURN (1);
 	}
 	PROC_UNLOCK(proc);
 	PFS_RETURN (0);
 }
 
 static int
 pfs_lookup_proc(pid_t pid, struct proc **p)
 {
 	struct proc *proc;
 
 	proc = pfind(pid);
 	if (proc == NULL)
 		return (0);
 	if ((proc->p_flag & P_WEXIT) != 0) {
 		PROC_UNLOCK(proc);
 		return (0);
 	}
 	_PHOLD(proc);
 	PROC_UNLOCK(proc);
 	*p = proc;
 	return (1);
 }
 
 /*
  * Verify permissions
  */
 static int
 pfs_access(struct vop_access_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct vattr vattr;
 	int error;
 
 	PFS_TRACE(("%s", pvd->pvd_pn->pn_name));
 	(void)pvd;
 
 	error = VOP_GETATTR(vn, &vattr, va->a_cred);
 	if (error)
 		PFS_RETURN (error);
 	error = vaccess(vn->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, va->a_accmode, va->a_cred, NULL);
 	PFS_RETURN (error);
 }
 
 /*
  * Close a file or directory
  */
 static int
 pfs_close(struct vop_close_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct proc *proc;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/*
 	 * Do nothing unless this is the last close and the node has a
 	 * last-close handler.
 	 */
 	if (vrefcnt(vn) > 1 || pn->pn_close == NULL)
 		PFS_RETURN (0);
 
 	if (pvd->pvd_pid != NO_PID) {
 		proc = pfind(pvd->pvd_pid);
 	} else {
 		proc = NULL;
 	}
 
 	error = pn_close(va->a_td, proc, pn);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Get file attributes
  */
 static int
 pfs_getattr(struct vop_getattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct vattr *vap = va->a_vap;
 	struct proc *proc;
 	int error = 0;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (ENOENT);
 
 	vap->va_type = vn->v_type;
 	vap->va_fileid = pn_fileno(pn, pvd->pvd_pid);
 	vap->va_flags = 0;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_bytes = vap->va_size = 0;
 	vap->va_filerev = 0;
 	vap->va_fsid = vn->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_nlink = 1;
 	nanotime(&vap->va_ctime);
 	vap->va_atime = vap->va_mtime = vap->va_ctime;
 
 	switch (pn->pn_type) {
 	case pfstype_procdir:
 	case pfstype_root:
 	case pfstype_dir:
 #if 0
 		pfs_lock(pn);
 		/* compute link count */
 		pfs_unlock(pn);
 #endif
 		vap->va_mode = 0555;
 		break;
 	case pfstype_file:
 	case pfstype_symlink:
 		vap->va_mode = 0444;
 		break;
 	default:
 		printf("shouldn't be here!\n");
 		vap->va_mode = 0;
 		break;
 	}
 
 	if (proc != NULL) {
 		vap->va_uid = proc->p_ucred->cr_ruid;
 		vap->va_gid = proc->p_ucred->cr_rgid;
 	} else {
 		vap->va_uid = 0;
 		vap->va_gid = 0;
 	}
 
 	if (pn->pn_attr != NULL)
 		error = pn_attr(curthread, proc, pn, vap);
 
 	if(proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Perform an ioctl
  */
 static int
 pfs_ioctl(struct vop_ioctl_args *va)
 {
 	struct vnode *vn;
 	struct pfs_vdata *pvd;
 	struct pfs_node *pn;
 	struct proc *proc;
 	int error;
 
 	vn = va->a_vp;
 	vn_lock(vn, LK_SHARED | LK_RETRY);
 	if (VN_IS_DOOMED(vn)) {
 		VOP_UNLOCK(vn, 0);
 		return (EBADF);
 	}
 	pvd = vn->v_data;
 	pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s: %lx", pn->pn_name, va->a_command));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (EINVAL);
 	}
 	KASSERT_PN_IS_FILE(pn);
 
 	if (pn->pn_ioctl == NULL) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (ENOTTY);
 	}
 
 	/*
 	 * This is necessary because process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc)) {
 		VOP_UNLOCK(vn, 0);
 		PFS_RETURN (EIO);
 	}
 
 	error = pn_ioctl(curthread, proc, pn, va->a_command, va->a_data);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	VOP_UNLOCK(vn, 0);
 	PFS_RETURN (error);
 }
 
 /*
  * Perform getextattr
  */
 static int
 pfs_getextattr(struct vop_getextattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct proc *proc;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 
 	if (pn->pn_getextattr == NULL)
 		error = EOPNOTSUPP;
 	else
 		error = pn_getextattr(curthread, proc, pn,
 		    va->a_attrnamespace, va->a_name, va->a_uio,
 		    va->a_size, va->a_cred);
 
 	if (proc != NULL)
 		PROC_UNLOCK(proc);
 
 	PFS_RETURN (error);
 }
 
 /*
  * Convert a vnode to its component name
  */
 static int
 pfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct pfs_vdata *pvd = vp->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	struct pfs_node *pn;
 	struct mount *mp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	char pidbuf[PFS_NAMELEN];
 	pid_t pid = pvd->pvd_pid;
 	int len, i, error, locked;
 
 	i = *buflen;
 	error = 0;
 
 	pfs_lock(pd);
 
 	if (vp->v_type == VDIR && pd->pn_type == pfstype_root) {
 		*dvp = vp;
 		vhold(*dvp);
 		pfs_unlock(pd);
 		PFS_RETURN (0);
 	} else if (vp->v_type == VDIR && pd->pn_type == pfstype_procdir) {
 		len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
 		i -= len;
 		if (i < 0) {
 			error = ENOMEM;
 			goto failed;
 		}
 		bcopy(pidbuf, buf + i, len);
 	} else {
 		len = strlen(pd->pn_name);
 		i -= len;
 		if (i < 0) {
 			error = ENOMEM;
 			goto failed;
 		}
 		bcopy(pd->pn_name, buf + i, len);
 	}
 
 	pn = pd->pn_parent;
 	pfs_unlock(pd);
 
 	mp = vp->v_mount;
 	error = vfs_busy(mp, 0);
 	if (error)
 		return (error);
 
 	/*
 	 * vp is held by caller.
 	 */
 	locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	error = pfs_vncache_alloc(mp, dvp, pn, pid);
 	if (error) {
 		vn_lock(vp, locked | LK_RETRY);
 		vfs_unbusy(mp);
 		PFS_RETURN(error);
 	}
 
 	*buflen = i;
 	VOP_UNLOCK(*dvp, 0);
 	vn_lock(vp, locked | LK_RETRY);
 	vfs_unbusy(mp);
 
 	PFS_RETURN (0);
 failed:
 	pfs_unlock(pd);
 	PFS_RETURN(error);
 }
 
 /*
  * Look up a file or directory
  */
 static int
 pfs_lookup(struct vop_cachedlookup_args *va)
 {
 	struct vnode *vn = va->a_dvp;
 	struct vnode **vpp = va->a_vpp;
 	struct componentname *cnp = va->a_cnp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	struct pfs_node *pn, *pdn = NULL;
 	struct mount *mp;
 	pid_t pid = pvd->pvd_pid;
 	char *pname;
 	int error, i, namelen, visible;
 
 	PFS_TRACE(("%.*s", (int)cnp->cn_namelen, cnp->cn_nameptr));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 	KASSERT_PN_IS_DIR(pd);
 
 	error = VOP_ACCESS(vn, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		PFS_RETURN (error);
 
 	/*
 	 * Don't support DELETE or RENAME.  CREATE is supported so
 	 * that O_CREAT will work, but the lookup will still fail if
 	 * the file does not exist.
 	 */
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		PFS_RETURN (EOPNOTSUPP);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= PFS_NAMELEN)
 		PFS_RETURN (ENOENT);
 
 	/* check that parent directory is visible... */
 	if (!pfs_visible(curthread, pd, pvd->pvd_pid, NULL))
 		PFS_RETURN (ENOENT);
 
 	/* self */
 	namelen = cnp->cn_namelen;
 	pname = cnp->cn_nameptr;
 	if (namelen == 1 && pname[0] == '.') {
 		pn = pd;
 		*vpp = vn;
 		VREF(vn);
 		PFS_RETURN (0);
 	}
 
 	mp = vn->v_mount;
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (pd->pn_type == pfstype_root)
 			PFS_RETURN (EIO);
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			VOP_UNLOCK(vn, 0);
 			error = vfs_busy(mp, 0);
 			vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 			vfs_rel(mp);
 			if (error != 0)
 				PFS_RETURN(ENOENT);
 			if (VN_IS_DOOMED(vn)) {
 				vfs_unbusy(mp);
 				PFS_RETURN(ENOENT);
 			}
 		}
 		VOP_UNLOCK(vn, 0);
 		KASSERT(pd->pn_parent != NULL,
 		    ("%s(): non-root directory has no parent", __func__));
 		/*
 		 * This one is tricky.  Descendents of procdir nodes
 		 * inherit their parent's process affinity, but
 		 * there's no easy reverse mapping.  For simplicity,
 		 * we assume that if this node is a procdir, its
 		 * parent isn't (which is correct as long as
 		 * descendents of procdir nodes are never procdir
 		 * nodes themselves)
 		 */
 		if (pd->pn_type == pfstype_procdir)
 			pid = NO_PID;
 		pfs_lock(pd);
 		pn = pd->pn_parent;
 		pfs_unlock(pd);
 		goto got_pnode;
 	}
 
 	pfs_lock(pd);
 
 	/* named node */
 	for (pn = pd->pn_nodes; pn != NULL; pn = pn->pn_next)
 		if (pn->pn_type == pfstype_procdir)
 			pdn = pn;
 		else if (pn->pn_name[namelen] == '\0' &&
 		    bcmp(pname, pn->pn_name, namelen) == 0) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 
 	/* process dependent node */
 	if ((pn = pdn) != NULL) {
 		pid = 0;
 		for (pid = 0, i = 0; i < namelen && isdigit(pname[i]); ++i)
 			if ((pid = pid * 10 + pname[i] - '0') > PID_MAX)
 				break;
 		if (i == cnp->cn_namelen) {
 			pfs_unlock(pd);
 			goto got_pnode;
 		}
 	}
 
 	pfs_unlock(pd);
 
 	PFS_RETURN (ENOENT);
 
  got_pnode:
 	pfs_assert_not_owned(pd);
 	pfs_assert_not_owned(pn);
 	visible = pfs_visible(curthread, pn, pid, NULL);
 	if (!visible) {
 		error = ENOENT;
 		goto failed;
 	}
 
 	error = pfs_vncache_alloc(mp, vpp, pn, pid);
 	if (error)
 		goto failed;
 
 	if (cnp->cn_flags & ISDOTDOT) {
 		vfs_unbusy(mp);
 		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 		if (VN_IS_DOOMED(vn)) {
 			vput(*vpp);
 			*vpp = NULL;
 			PFS_RETURN(ENOENT);
 		}
 	}
 	if (cnp->cn_flags & MAKEENTRY && !VN_IS_DOOMED(vn))
 		cache_enter(vn, *vpp, cnp);
 	PFS_RETURN (0);
  failed:
 	if (cnp->cn_flags & ISDOTDOT) {
 		vfs_unbusy(mp);
 		vn_lock(vn, LK_EXCLUSIVE | LK_RETRY);
 		*vpp = NULL;
 	}
 	PFS_RETURN(error);
 }
 
 /*
  * Open a file or directory.
  */
 static int
 pfs_open(struct vop_open_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	int mode = va->a_mode;
 
 	PFS_TRACE(("%s (mode 0x%x)", pn->pn_name, mode));
 	pfs_assert_not_owned(pn);
 
 	/* check if the requested mode is permitted */
 	if (((mode & FREAD) && !(mode & PFS_RD)) ||
 	    ((mode & FWRITE) && !(mode & PFS_WR)))
 		PFS_RETURN (EPERM);
 
 	/* we don't support locking */
 	if ((mode & O_SHLOCK) || (mode & O_EXLOCK))
 		PFS_RETURN (EOPNOTSUPP);
 
 	PFS_RETURN (0);
 }
 
 /*
  * Read from a file
  */
 static int
 pfs_read(struct vop_read_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf *sb = NULL;
 	int error, locked;
 	off_t buflen;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_FILE(pn);
 
 	if (!(pn->pn_flags & PFS_RD))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	vhold(vn);
 	locked = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	if (pn->pn_flags & PFS_RAWRD) {
 		PFS_TRACE(("%zd resid", uio->uio_resid));
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		PFS_TRACE(("%zd resid", uio->uio_resid));
 		goto ret;
 	}
 
 	if (uio->uio_resid < 0 || uio->uio_offset < 0 ||
 	    uio->uio_resid > OFF_MAX - uio->uio_offset) {
 		error = EINVAL;
 		goto ret;
 	}
 	buflen = uio->uio_offset + uio->uio_resid;
 	if (buflen > MAXPHYS)
 		buflen = MAXPHYS;
 
 	sb = sbuf_new(sb, NULL, buflen + 1, 0);
 	if (sb == NULL) {
 		error = EIO;
 		goto ret;
 	}
 
 	error = pn_fill(curthread, proc, pn, sb, uio);
 
 	if (error) {
 		sbuf_delete(sb);
 		goto ret;
 	}
 
 	/*
 	 * XXX: If the buffer overflowed, sbuf_len() will not return
 	 * the data length. Then just use the full length because an
 	 * overflowed sbuf must be full.
 	 */
 	if (sbuf_finish(sb) == 0)
 		buflen = sbuf_len(sb);
 	error = uiomove_frombuf(sbuf_data(sb), buflen, uio);
 	sbuf_delete(sb);
 ret:
 	vn_lock(vn, locked | LK_RETRY);
 	vdrop(vn);
 	if (proc != NULL)
 		PRELE(proc);
 	PFS_RETURN (error);
 }
 
 /*
  * Iterate through directory entries
  */
 static int
 pfs_iterate(struct thread *td, struct proc *proc, struct pfs_node *pd,
 	    struct pfs_node **pn, struct proc **p)
 {
 	int visible;
 
 	sx_assert(&allproc_lock, SX_SLOCKED);
 	pfs_assert_owned(pd);
  again:
 	if (*pn == NULL) {
 		/* first node */
 		*pn = pd->pn_nodes;
 	} else if ((*pn)->pn_type != pfstype_procdir) {
 		/* next node */
 		*pn = (*pn)->pn_next;
 	}
 	if (*pn != NULL && (*pn)->pn_type == pfstype_procdir) {
 		/* next process */
 		if (*p == NULL)
 			*p = LIST_FIRST(&allproc);
 		else
 			*p = LIST_NEXT(*p, p_list);
 		/* out of processes: next node */
 		if (*p == NULL)
 			*pn = (*pn)->pn_next;
 		else
 			PROC_LOCK(*p);
 	}
 
 	if ((*pn) == NULL)
 		return (-1);
 
 	if (*p != NULL) {
 		visible = pfs_visible_proc(td, *pn, *p);
 		PROC_UNLOCK(*p);
 	} else if (proc != NULL) {
 		visible = pfs_visible_proc(td, *pn, proc);
 	} else {
 		visible = 1;
 	}
 	if (!visible)
 		goto again;
 
 	return (0);
 }
 
 /* Directory entry list */
 struct pfsentry {
 	STAILQ_ENTRY(pfsentry)	link;
 	struct dirent		entry;
 };
 STAILQ_HEAD(pfsdirentlist, pfsentry);
 
 /*
  * Return directory entries.
  */
 static int
 pfs_readdir(struct vop_readdir_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pd = pvd->pvd_pn;
 	pid_t pid = pvd->pvd_pid;
 	struct proc *p, *proc;
 	struct pfs_node *pn;
 	struct uio *uio;
 	struct pfsentry *pfsent, *pfsent2;
 	struct pfsdirentlist lst;
 	off_t offset;
 	int error, i, resid;
 
 	STAILQ_INIT(&lst);
 	error = 0;
 	KASSERT(pd->pn_info == vn->v_mount->mnt_data,
 	    ("%s(): pn_info does not match mountpoint", __func__));
 	PFS_TRACE(("%s pid %lu", pd->pn_name, (unsigned long)pid));
 	pfs_assert_not_owned(pd);
 
 	if (vn->v_type != VDIR)
 		PFS_RETURN (ENOTDIR);
 	KASSERT_PN_IS_DIR(pd);
 	uio = va->a_uio;
 
 	/* only allow reading entire entries */
 	offset = uio->uio_offset;
 	resid = uio->uio_resid;
 	if (offset < 0 || offset % PFS_DELEN != 0 ||
 	    (resid && resid < PFS_DELEN))
 		PFS_RETURN (EINVAL);
 	if (resid == 0)
 		PFS_RETURN (0);
 
 	proc = NULL;
 	if (pid != NO_PID && !pfs_lookup_proc(pid, &proc))
 		PFS_RETURN (ENOENT);
 
 	sx_slock(&allproc_lock);
 	pfs_lock(pd);
 
 	KASSERT(pid == NO_PID || proc != NULL,
 	    ("%s(): no process for pid %lu", __func__, (unsigned long)pid));
 
 	if (pid != NO_PID) {
 		PROC_LOCK(proc);
 
 		/* check if the directory is visible to the caller */
 		if (!pfs_visible_proc(curthread, pd, proc)) {
 			_PRELE(proc);
 			PROC_UNLOCK(proc);
 			sx_sunlock(&allproc_lock);
 			pfs_unlock(pd);
 			PFS_RETURN (ENOENT);
 		}
 	}
 
 	/* skip unwanted entries */
 	for (pn = NULL, p = NULL; offset > 0; offset -= PFS_DELEN) {
 		if (pfs_iterate(curthread, proc, pd, &pn, &p) == -1) {
 			/* nothing left... */
 			if (proc != NULL) {
 				_PRELE(proc);
 				PROC_UNLOCK(proc);
 			}
 			pfs_unlock(pd);
 			sx_sunlock(&allproc_lock);
 			PFS_RETURN (0);
 		}
 	}
 
 	/* fill in entries */
 	while (pfs_iterate(curthread, proc, pd, &pn, &p) != -1 &&
 	    resid >= PFS_DELEN) {
 		if ((pfsent = malloc(sizeof(struct pfsentry), M_IOV,
 		    M_NOWAIT | M_ZERO)) == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		pfsent->entry.d_reclen = PFS_DELEN;
 		pfsent->entry.d_fileno = pn_fileno(pn, pid);
 		/* PFS_DELEN was picked to fit PFS_NAMLEN */
 		for (i = 0; i < PFS_NAMELEN - 1 && pn->pn_name[i] != '\0'; ++i)
 			pfsent->entry.d_name[i] = pn->pn_name[i];
 		pfsent->entry.d_namlen = i;
 		/* NOTE: d_off is the offset of the *next* entry. */
 		pfsent->entry.d_off = offset + PFS_DELEN;
 		switch (pn->pn_type) {
 		case pfstype_procdir:
 			KASSERT(p != NULL,
 			    ("reached procdir node with p == NULL"));
 			pfsent->entry.d_namlen = snprintf(pfsent->entry.d_name,
 			    PFS_NAMELEN, "%d", p->p_pid);
 			/* fall through */
 		case pfstype_root:
 		case pfstype_dir:
 		case pfstype_this:
 		case pfstype_parent:
 			pfsent->entry.d_type = DT_DIR;
 			break;
 		case pfstype_file:
 			pfsent->entry.d_type = DT_REG;
 			break;
 		case pfstype_symlink:
 			pfsent->entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->pn_name, pn->pn_type);
 		}
 		PFS_TRACE(("%s", pfsent->entry.d_name));
 		dirent_terminate(&pfsent->entry);
 		STAILQ_INSERT_TAIL(&lst, pfsent, link);
 		offset += PFS_DELEN;
 		resid -= PFS_DELEN;
 	}
 	if (proc != NULL) {
 		_PRELE(proc);
 		PROC_UNLOCK(proc);
 	}
 	pfs_unlock(pd);
 	sx_sunlock(&allproc_lock);
 	i = 0;
 	STAILQ_FOREACH_SAFE(pfsent, &lst, link, pfsent2) {
 		if (error == 0)
 			error = uiomove(&pfsent->entry, PFS_DELEN, uio);
 		free(pfsent, M_IOV);
 		i++;
 	}
 	PFS_TRACE(("%ju bytes", (uintmax_t)(i * PFS_DELEN)));
 	PFS_RETURN (error);
 }
 
 /*
  * Read a symbolic link
  */
 static int
 pfs_readlink(struct vop_readlink_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc = NULL;
 	char buf[PATH_MAX];
 	struct sbuf sb;
 	int error, locked;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VLNK)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_LINK(pn);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	if (pvd->pvd_pid != NO_PID) {
 		if ((proc = pfind(pvd->pvd_pid)) == NULL)
 			PFS_RETURN (EIO);
 		if (proc->p_flag & P_WEXIT) {
 			PROC_UNLOCK(proc);
 			PFS_RETURN (EIO);
 		}
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 	vhold(vn);
 	locked = VOP_ISLOCKED(vn);
 	VOP_UNLOCK(vn, 0);
 
 	/* sbuf_new() can't fail with a static buffer */
 	sbuf_new(&sb, buf, sizeof buf, 0);
 
 	error = pn_fill(curthread, proc, pn, &sb, NULL);
 
 	if (proc != NULL)
 		PRELE(proc);
 	vn_lock(vn, locked | LK_RETRY);
 	vdrop(vn);
 
 	if (error) {
 		sbuf_delete(&sb);
 		PFS_RETURN (error);
 	}
 
 	if (sbuf_finish(&sb) != 0) {
 		sbuf_delete(&sb);
 		PFS_RETURN (ENAMETOOLONG);
 	}
 
 	error = uiomove_frombuf(sbuf_data(&sb), sbuf_len(&sb), uio);
 	sbuf_delete(&sb);
 	PFS_RETURN (error);
 }
 
 /*
  * Reclaim a vnode
  */
 static int
 pfs_reclaim(struct vop_reclaim_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	return (pfs_vncache_free(va->a_vp));
 }
 
 /*
  * Set attributes
  */
 static int
 pfs_setattr(struct vop_setattr_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	/* Silently ignore unchangeable attributes. */
 	PFS_RETURN (0);
 }
 
 /*
  * Write to a file
  */
 static int
 pfs_write(struct vop_write_args *va)
 {
 	struct vnode *vn = va->a_vp;
 	struct pfs_vdata *pvd = vn->v_data;
 	struct pfs_node *pn = pvd->pvd_pn;
 	struct uio *uio = va->a_uio;
 	struct proc *proc;
 	struct sbuf sb;
 	int error;
 
 	PFS_TRACE(("%s", pn->pn_name));
 	pfs_assert_not_owned(pn);
 
 	if (vn->v_type != VREG)
 		PFS_RETURN (EINVAL);
 	KASSERT_PN_IS_FILE(pn);
 
 	if (!(pn->pn_flags & PFS_WR))
 		PFS_RETURN (EBADF);
 
 	if (pn->pn_fill == NULL)
 		PFS_RETURN (EIO);
 
 	/*
 	 * This is necessary because either process' privileges may
 	 * have changed since the open() call.
 	 */
 	if (!pfs_visible(curthread, pn, pvd->pvd_pid, &proc))
 		PFS_RETURN (EIO);
 	if (proc != NULL) {
 		_PHOLD(proc);
 		PROC_UNLOCK(proc);
 	}
 
 	if (pn->pn_flags & PFS_RAWWR) {
 		error = pn_fill(curthread, proc, pn, NULL, uio);
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	sbuf_uionew(&sb, uio, &error);
 	if (error) {
 		if (proc != NULL)
 			PRELE(proc);
 		PFS_RETURN (error);
 	}
 
 	error = pn_fill(curthread, proc, pn, &sb, uio);
 
 	sbuf_delete(&sb);
 	if (proc != NULL)
 		PRELE(proc);
 	PFS_RETURN (error);
 }
 
 /*
  * Vnode operations
  */
 struct vop_vector pfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		pfs_access,
 	.vop_cachedlookup =	pfs_lookup,
 	.vop_close =		pfs_close,
 	.vop_create =		VOP_EOPNOTSUPP,
 	.vop_getattr =		pfs_getattr,
 	.vop_getextattr =	pfs_getextattr,
 	.vop_ioctl =		pfs_ioctl,
 	.vop_link =		VOP_EOPNOTSUPP,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		VOP_EOPNOTSUPP,
 	.vop_mknod =		VOP_EOPNOTSUPP,
 	.vop_open =		pfs_open,
 	.vop_read =		pfs_read,
 	.vop_readdir =		pfs_readdir,
 	.vop_readlink =		pfs_readlink,
 	.vop_reclaim =		pfs_reclaim,
 	.vop_remove =		VOP_EOPNOTSUPP,
 	.vop_rename =		VOP_EOPNOTSUPP,
 	.vop_rmdir =		VOP_EOPNOTSUPP,
 	.vop_setattr =		pfs_setattr,
 	.vop_symlink =		VOP_EOPNOTSUPP,
 	.vop_vptocnp =		pfs_vptocnp,
 	.vop_write =		pfs_write,
 	/* XXX I've probably forgotten a few that need VOP_EOPNOTSUPP */
 };
+VFS_VOP_VECTOR_REGISTER(pfs_vnodeops);
Index: head/sys/fs/smbfs/smbfs_vnops.c
===================================================================
--- head/sys/fs/smbfs/smbfs_vnops.c	(revision 355789)
+++ head/sys/fs/smbfs/smbfs_vnops.c	(revision 355790)
@@ -1,1383 +1,1384 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2001 Boris Popov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/limits.h>
 #include <sys/lockf.h>
 #include <sys/stat.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 
 #include <netsmb/smb.h>
 #include <netsmb/smb_conn.h>
 #include <netsmb/smb_subr.h>
 
 #include <fs/smbfs/smbfs.h>
 #include <fs/smbfs/smbfs_node.h>
 #include <fs/smbfs/smbfs_subr.h>
 
 /*
  * Prototypes for SMBFS vnode operations
  */
 static vop_create_t	smbfs_create;
 static vop_mknod_t	smbfs_mknod;
 static vop_open_t	smbfs_open;
 static vop_close_t	smbfs_close;
 static vop_access_t	smbfs_access;
 static vop_getattr_t	smbfs_getattr;
 static vop_setattr_t	smbfs_setattr;
 static vop_read_t	smbfs_read;
 static vop_write_t	smbfs_write;
 static vop_fsync_t	smbfs_fsync;
 static vop_remove_t	smbfs_remove;
 static vop_link_t	smbfs_link;
 static vop_lookup_t	smbfs_lookup;
 static vop_rename_t	smbfs_rename;
 static vop_mkdir_t	smbfs_mkdir;
 static vop_rmdir_t	smbfs_rmdir;
 static vop_symlink_t	smbfs_symlink;
 static vop_readdir_t	smbfs_readdir;
 static vop_strategy_t	smbfs_strategy;
 static vop_print_t	smbfs_print;
 static vop_pathconf_t	smbfs_pathconf;
 static vop_advlock_t	smbfs_advlock;
 static vop_getextattr_t	smbfs_getextattr;
 
 struct vop_vector smbfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		smbfs_access,
 	.vop_advlock =		smbfs_advlock,
 	.vop_close =		smbfs_close,
 	.vop_create =		smbfs_create,
 	.vop_fsync =		smbfs_fsync,
 	.vop_getattr =		smbfs_getattr,
 	.vop_getextattr = 	smbfs_getextattr,
 	.vop_getpages =		smbfs_getpages,
 	.vop_inactive =		smbfs_inactive,
 	.vop_ioctl =		smbfs_ioctl,
 	.vop_link =		smbfs_link,
 	.vop_lookup =		smbfs_lookup,
 	.vop_mkdir =		smbfs_mkdir,
 	.vop_mknod =		smbfs_mknod,
 	.vop_open =		smbfs_open,
 	.vop_pathconf =		smbfs_pathconf,
 	.vop_print =		smbfs_print,
 	.vop_putpages =		smbfs_putpages,
 	.vop_read =		smbfs_read,
 	.vop_readdir =		smbfs_readdir,
 	.vop_reclaim =		smbfs_reclaim,
 	.vop_remove =		smbfs_remove,
 	.vop_rename =		smbfs_rename,
 	.vop_rmdir =		smbfs_rmdir,
 	.vop_setattr =		smbfs_setattr,
 /*	.vop_setextattr =	smbfs_setextattr,*/
 	.vop_strategy =		smbfs_strategy,
 	.vop_symlink =		smbfs_symlink,
 	.vop_write =		smbfs_write,
 };
+VFS_VOP_VECTOR_REGISTER(smbfs_vnodeops);
 
 static int
 smbfs_access(ap)
 	struct vop_access_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	accmode_t accmode = ap->a_accmode;
 	mode_t mpmode;
 	struct smbmount *smp = VTOSMBFS(vp);
 
 	SMBVDEBUG("\n");
 	if ((accmode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (vp->v_type) {
 		    case VREG: case VDIR: case VLNK:
 			return EROFS;
 		    default:
 			break;
 		}
 	}
 	mpmode = vp->v_type == VREG ? smp->sm_file_mode : smp->sm_dir_mode;
 	return (vaccess(vp->v_type, mpmode, smp->sm_uid,
 	    smp->sm_gid, ap->a_accmode, ap->a_cred, NULL));
 }
 
 /* ARGSUSED */
 static int
 smbfs_open(ap)
 	struct vop_open_args /* {
 		struct vnode *a_vp;
 		int  a_mode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred *scred;
 	struct vattr vattr;
 	int mode = ap->a_mode;
 	int error, accmode;
 
 	SMBVDEBUG("%s,%d\n", np->n_name, (np->n_flag & NOPEN) != 0);
 	if (vp->v_type != VREG && vp->v_type != VDIR) { 
 		SMBFSERR("open eacces vtype=%d\n", vp->v_type);
 		return EACCES;
 	}
 	if (vp->v_type == VDIR) {
 		np->n_flag |= NOPEN;
 		return 0;
 	}
 	if (np->n_flag & NMODIFIED) {
 		if ((error = smbfs_vinvalbuf(vp, ap->a_td)) == EINTR)
 			return error;
 		smbfs_attr_cacheremove(vp);
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error)
 			return error;
 		np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 	} else {
 		error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 		if (error)
 			return error;
 		if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) {
 			error = smbfs_vinvalbuf(vp, ap->a_td);
 			if (error == EINTR)
 				return error;
 			np->n_mtime.tv_sec = vattr.va_mtime.tv_sec;
 		}
 	}
 	if ((np->n_flag & NOPEN) != 0)
 		return 0;
 	/*
 	 * Use DENYNONE to give unixy semantics of permitting
 	 * everything not forbidden by permissions.  Ie denial
 	 * is up to server with clients/openers needing to use
 	 * advisory locks for further control.
 	 */
 	accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0)
 		accmode = SMB_SM_DENYNONE|SMB_AM_OPENRW;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, ap->a_td, ap->a_cred);
 	error = smbfs_smb_open(np, accmode, scred);
 	if (error) {
 		if (mode & FWRITE)
 			return EACCES;
 		else if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			accmode = SMB_SM_DENYNONE|SMB_AM_OPENREAD;
 			error = smbfs_smb_open(np, accmode, scred);
 		}
 	}
 	if (error == 0) {
 		np->n_flag |= NOPEN;
 		vnode_create_vobject(ap->a_vp, vattr.va_size, ap->a_td);
 	}
 	smbfs_attr_cacheremove(vp);
 	smbfs_free_scred(scred);
 	return error;
 }
 
 static int
 smbfs_close(ap)
 	struct vop_close_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred *scred;
 
 	if (vp->v_type == VDIR && (np->n_flag & NOPEN) != 0 &&
 	    np->n_dirseq != NULL) {
 		scred = smbfs_malloc_scred();
 		smb_makescred(scred, td, ap->a_cred);
 		smbfs_findclose(np->n_dirseq, scred);
 		smbfs_free_scred(scred);
 		np->n_dirseq = NULL;
 	}
 	return 0;
 }
 
 /*
  * smbfs_getattr call from vfs.
  */
 static int
 smbfs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct vattr *va=ap->a_vap;
 	struct smbfattr fattr;
 	struct smb_cred *scred;
 	u_quad_t oldsize;
 	int error;
 
 	SMBVDEBUG("%lx: '%s' %d\n", (long)vp, np->n_name, (vp->v_vflag & VV_ROOT) != 0);
 	error = smbfs_attr_cachelookup(vp, va);
 	if (!error)
 		return 0;
 	SMBVDEBUG("not in the cache\n");
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, curthread, ap->a_cred);
 	oldsize = np->n_size;
 	error = smbfs_smb_lookup(np, NULL, 0, &fattr, scred);
 	if (error) {
 		SMBVDEBUG("error %d\n", error);
 		smbfs_free_scred(scred);
 		return error;
 	}
 	smbfs_attr_cacheenter(vp, &fattr);
 	smbfs_attr_cachelookup(vp, va);
 	if (np->n_flag & NOPEN)
 		np->n_size = oldsize;
 	smbfs_free_scred(scred);
 	return 0;
 }
 
 static int
 smbfs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct vattr *vap = ap->a_vap;
 	struct timespec *mtime, *atime;
 	struct smb_cred *scred;
 	struct smb_share *ssp = np->n_mount->sm_share;
 	struct smb_vc *vcp = SSTOVC(ssp);
 	struct thread *td = curthread;
 	u_quad_t tsize = 0;
 	int isreadonly, doclose, error = 0;
 	int old_n_dosattr;
 
 	SMBVDEBUG("\n");
 	isreadonly = (vp->v_mount->mnt_flag & MNT_RDONLY);
 	/*
 	 * Disallow write attempts if the filesystem is mounted read-only.
 	 */
   	if ((vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || 
 	     vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
 	     vap->va_mode != (mode_t)VNOVAL || vap->va_flags != VNOVAL) &&
 	     isreadonly)
 		return EROFS;
 
 	/*
 	 * We only support setting four flags.  Don't allow setting others.
 	 *
 	 * We map UF_READONLY to SMB_FA_RDONLY, unlike the MacOS X version
 	 * of this code, which maps both UF_IMMUTABLE AND SF_IMMUTABLE to
 	 * SMB_FA_RDONLY.  The immutable flags have different semantics
 	 * than readonly, which is the reason for the difference.
 	 */
 	if (vap->va_flags != VNOVAL) {
 		if (vap->va_flags & ~(UF_HIDDEN|UF_SYSTEM|UF_ARCHIVE|
 				      UF_READONLY))
 			return EINVAL;
 	}
 
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, ap->a_cred);
 	if (vap->va_size != VNOVAL) {
  		switch (vp->v_type) {
  		    case VDIR:
  			error = EISDIR;
 			goto out;
  		    case VREG:
 			break;
  		    default:
 			error = EINVAL;
 			goto out;
   		}
 		if (isreadonly) {
 			error = EROFS;
 			goto out;
 		}
 		doclose = 0;
 		vnode_pager_setsize(vp, (u_long)vap->va_size);
  		tsize = np->n_size;
  		np->n_size = vap->va_size;
 		if ((np->n_flag & NOPEN) == 0) {
 			error = smbfs_smb_open(np,
 					       SMB_SM_DENYNONE|SMB_AM_OPENRW,
 					       scred);
 			if (error == 0)
 				doclose = 1;
 		}
 		if (error == 0)
 			error = smbfs_smb_setfsize(np,
 			    (int64_t)vap->va_size, scred);
 		if (doclose)
 			smbfs_smb_close(ssp, np->n_fid, NULL, scred);
 		if (error) {
 			np->n_size = tsize;
 			vnode_pager_setsize(vp, (u_long)tsize);
 			goto out;
 		}
   	}
 	if ((vap->va_flags != VNOVAL) || (vap->va_mode != (mode_t)VNOVAL)) {
 		old_n_dosattr = np->n_dosattr;
 
 		if (vap->va_mode != (mode_t)VNOVAL) {
 			if (vap->va_mode & S_IWUSR)
 				np->n_dosattr &= ~SMB_FA_RDONLY;
 			else
 				np->n_dosattr |= SMB_FA_RDONLY;
 		}
 
 		if (vap->va_flags != VNOVAL) {
 			if (vap->va_flags & UF_HIDDEN)
 				np->n_dosattr |= SMB_FA_HIDDEN;
 			else
 				np->n_dosattr &= ~SMB_FA_HIDDEN;
 
 			if (vap->va_flags & UF_SYSTEM)
 				np->n_dosattr |= SMB_FA_SYSTEM;
 			else
 				np->n_dosattr &= ~SMB_FA_SYSTEM;
 
 			if (vap->va_flags & UF_ARCHIVE)
 				np->n_dosattr |= SMB_FA_ARCHIVE;
 			else
 				np->n_dosattr &= ~SMB_FA_ARCHIVE;
 
 			/*
 			 * We only support setting the immutable / readonly
 			 * bit for regular files.  According to comments in
 			 * the MacOS X version of this code, supporting the
 			 * readonly bit on directories doesn't do the same
 			 * thing in Windows as in Unix.
 			 */
 			if (vp->v_type == VREG) {
 				if (vap->va_flags & UF_READONLY)
 					np->n_dosattr |= SMB_FA_RDONLY;
 				else
 					np->n_dosattr &= ~SMB_FA_RDONLY;
 			}
 		}
 
 		if (np->n_dosattr != old_n_dosattr) {
 			error = smbfs_smb_setpattr(np, np->n_dosattr, NULL, scred);
 			if (error)
 				goto out;
 		}
 	}
 	mtime = atime = NULL;
 	if (vap->va_mtime.tv_sec != VNOVAL)
 		mtime = &vap->va_mtime;
 	if (vap->va_atime.tv_sec != VNOVAL)
 		atime = &vap->va_atime;
 	if (mtime != atime) {
 		if (vap->va_vaflags & VA_UTIMES_NULL) {
 			error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td);
 			if (error)
 				error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td);
 		} else
 			error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td);
 #if 0
 		if (mtime == NULL)
 			mtime = &np->n_mtime;
 		if (atime == NULL)
 			atime = &np->n_atime;
 #endif
 		/*
 		 * If file is opened, then we can use handle based calls.
 		 * If not, use path based ones.
 		 */
 		if ((np->n_flag & NOPEN) == 0) {
 			if (vcp->vc_flags & SMBV_WIN95) {
 				error = VOP_OPEN(vp, FWRITE, ap->a_cred, td,
 				    NULL);
 				if (!error) {
 /*					error = smbfs_smb_setfattrNT(np, 0,
 					    mtime, atime, scred);
 					VOP_GETATTR(vp, &vattr, ap->a_cred); */
 					if (mtime)
 						np->n_mtime = *mtime;
 					VOP_CLOSE(vp, FWRITE, ap->a_cred, td);
 				}
 			} else if ((vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS)) {
 				error = smbfs_smb_setptime2(np, mtime, atime, 0, scred);
 /*				error = smbfs_smb_setpattrNT(np, 0, mtime, atime, scred);*/
 			} else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN2_0) {
 				error = smbfs_smb_setptime2(np, mtime, atime, 0, scred);
 			} else {
 				error = smbfs_smb_setpattr(np, 0, mtime, scred);
 			}
 		} else {
 			if (vcp->vc_sopt.sv_caps & SMB_CAP_NT_SMBS) {
 				error = smbfs_smb_setfattrNT(np, 0, mtime, atime, scred);
 			} else if (SMB_DIALECT(vcp) >= SMB_DIALECT_LANMAN1_0) {
 				error = smbfs_smb_setftime(np, mtime, atime, scred);
 			} else {
 				/*
 				 * I have no idea how to handle this for core
 				 * level servers. The possible solution is to
 				 * update mtime after file is closed.
 				 */
 				 SMBERROR("can't update times on an opened file\n");
 			}
 		}
 	}
 	/*
 	 * Invalidate attribute cache in case if server doesn't set
 	 * required attributes.
 	 */
 	smbfs_attr_cacheremove(vp);	/* invalidate cache */
 	VOP_GETATTR(vp, vap, ap->a_cred);
 	np->n_mtime.tv_sec = vap->va_mtime.tv_sec;
 out:
 	smbfs_free_scred(scred);
 	return error;
 }
 /*
  * smbfs_read call.
  */
 static int
 smbfs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	SMBVDEBUG("\n");
 	if (vp->v_type != VREG && vp->v_type != VDIR)
 		return EPERM;
 	return smbfs_readvnode(vp, uio, ap->a_cred);
 }
 
 static int
 smbfs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int  a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 
 	SMBVDEBUG("%d,ofs=%jd,sz=%zd\n",vp->v_type, (intmax_t)uio->uio_offset, 
 	    uio->uio_resid);
 	if (vp->v_type != VREG)
 		return (EPERM);
 	return smbfs_writevnode(vp, uio, ap->a_cred,ap->a_ioflag);
 }
 /*
  * smbfs_create call
  * Create a regular file. On entry the directory to contain the file being
  * created is locked.  We must release before we return. We must also free
  * the pathname buffer pointed at by cnp->cn_pnbuf, always on error, or
  * only if the SAVESTART bit in cn_flags is clear on success.
  */
 static int
 smbfs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp=ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct vnode *vp;
 	struct vattr vattr;
 	struct smbfattr fattr;
 	struct smb_cred *scred;
 	char *name = cnp->cn_nameptr;
 	int nmlen = cnp->cn_namelen;
 	int error;
 	
 
 	SMBVDEBUG("\n");
 	*vpp = NULL;
 	if (vap->va_type != VREG)
 		return EOPNOTSUPP;
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred)))
 		return error;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, cnp->cn_thread, cnp->cn_cred);
 	
 	error = smbfs_smb_create(dnp, name, nmlen, scred);
 	if (error)
 		goto out;
 	error = smbfs_smb_lookup(dnp, name, nmlen, &fattr, scred);
 	if (error)
 		goto out;
 	error = smbfs_nget(VTOVFS(dvp), dvp, name, nmlen, &fattr, &vp);
 	if (error)
 		goto out;
 	*vpp = vp;
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(dvp, vp, cnp);
 out:
 	smbfs_free_scred(scred);
 	return error;
 }
 
 static int
 smbfs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_dvp;
 		struct vnode * a_vp;
 		struct componentname * a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 /*	struct vnode *dvp = ap->a_dvp;*/
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred *scred;
 	int error;
 
 	if (vp->v_type == VDIR || (np->n_flag & NOPEN) != 0 || vrefcnt(vp) != 1)
 		return EPERM;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_delete(np, scred);
 	if (error == 0)
 		np->n_flag |= NGONE;
 	cache_purge(vp);
 	smbfs_free_scred(scred);
 	return error;
 }
 
 /*
  * smbfs_file rename call
  */
 static int
 smbfs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 /*	struct componentname *fcnp = ap->a_fcnp;*/
 	struct smb_cred *scred;
 	u_int16_t flags = 6;
 	int error=0;
 
 	scred = NULL;
 	/* Check for cross-device rename */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		goto out;
 	}
 
 	if (tvp && vrefcnt(tvp) > 1) {
 		error = EBUSY;
 		goto out;
 	}
 	flags = 0x10;			/* verify all writes */
 	if (fvp->v_type == VDIR) {
 		flags |= 2;
 	} else if (fvp->v_type == VREG) {
 		flags |= 1;
 	} else {
 		return EINVAL;
 	}
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, tcnp->cn_thread, tcnp->cn_cred);
 	/*
 	 * It seems that Samba doesn't implement SMB_COM_MOVE call...
 	 */
 #ifdef notnow
 	if (SMB_DIALECT(SSTOCN(smp->sm_share)) >= SMB_DIALECT_LANMAN1_0) {
 		error = smbfs_smb_move(VTOSMB(fvp), VTOSMB(tdvp),
 		    tcnp->cn_nameptr, tcnp->cn_namelen, flags, scred);
 	} else
 #endif
 	{
 		/*
 		 * We have to do the work atomicaly
 		 */
 		if (tvp && tvp != fvp) {
 			error = smbfs_smb_delete(VTOSMB(tvp), scred);
 			if (error)
 				goto out_cacherem;
 			VTOSMB(fvp)->n_flag |= NGONE;
 		}
 		error = smbfs_smb_rename(VTOSMB(fvp), VTOSMB(tdvp),
 		    tcnp->cn_nameptr, tcnp->cn_namelen, scred);
 	}
 
 	if (fvp->v_type == VDIR) {
 		if (tvp != NULL && tvp->v_type == VDIR)
 			cache_purge(tdvp);
 		cache_purge(fdvp);
 	}
 
 out_cacherem:
 	smbfs_attr_cacheremove(fdvp);
 	smbfs_attr_cacheremove(tdvp);
 out:
 	smbfs_free_scred(scred);
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp)
 		vput(tvp);
 	vrele(fdvp);
 	vrele(fvp);
 #ifdef possible_mistake
 	vgone(fvp);
 	if (tvp)
 		vgone(tvp);
 #endif
 	return error;
 }
 
 /*
  * somtime it will come true...
  */
 static int
 smbfs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	return EOPNOTSUPP;
 }
 
 /*
  * smbfs_symlink link create call.
  * Sometime it will be functional...
  */
 static int
 smbfs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		char *a_target;
 	} */ *ap;
 {
 	return EOPNOTSUPP;
 }
 
 static int
 smbfs_mknod(ap) 
 	struct vop_mknod_args /* {
 	} */ *ap;
 {
 	return EOPNOTSUPP;
 }
 
 static int
 smbfs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 /*	struct vattr *vap = ap->a_vap;*/
 	struct vnode *vp;
 	struct componentname *cnp = ap->a_cnp;
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct vattr vattr;
 	struct smb_cred *scred;
 	struct smbfattr fattr;
 	char *name = cnp->cn_nameptr;
 	int len = cnp->cn_namelen;
 	int error;
 
 	if ((error = VOP_GETATTR(dvp, &vattr, cnp->cn_cred))) {
 		return error;
 	}	
 	if ((name[0] == '.') && ((len == 1) || ((len == 2) && (name[1] == '.'))))
 		return EEXIST;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_mkdir(dnp, name, len, scred);
 	if (error)
 		goto out;
 	error = smbfs_smb_lookup(dnp, name, len, &fattr, scred);
 	if (error)
 		goto out;
 	error = smbfs_nget(VTOVFS(dvp), dvp, name, len, &fattr, &vp);
 	if (error)
 		goto out;
 	*ap->a_vpp = vp;
 out:
 	smbfs_free_scred(scred);
 	return error;
 }
 
 /*
  * smbfs_remove directory call
  */
 static int
 smbfs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 /*	struct smbmount *smp = VTOSMBFS(vp);*/
 	struct smbnode *dnp = VTOSMB(dvp);
 	struct smbnode *np = VTOSMB(vp);
 	struct smb_cred *scred;
 	int error;
 
 	if (dvp == vp)
 		return EINVAL;
 
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, cnp->cn_thread, cnp->cn_cred);
 	error = smbfs_smb_rmdir(np, scred);
 	if (error == 0)
 		np->n_flag |= NGONE;
 	dnp->n_flag |= NMODIFIED;
 	smbfs_attr_cacheremove(dvp);
 /*	cache_purge(dvp);*/
 	cache_purge(vp);
 	smbfs_free_scred(scred);
 	return error;
 }
 
 /*
  * smbfs_readdir call
  */
 static int
 smbfs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		u_long *a_cookies;
 		int a_ncookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	int error;
 
 	if (vp->v_type != VDIR)
 		return (EPERM);
 #ifdef notnow
 	if (ap->a_ncookies) {
 		printf("smbfs_readdir: no support for cookies now...");
 		return (EOPNOTSUPP);
 	}
 #endif
 	error = smbfs_readvnode(vp, uio, ap->a_cred);
 	return error;
 }
 
 /* ARGSUSED */
 static int
 smbfs_fsync(ap)
 	struct vop_fsync_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode * a_vp;
 		struct ucred * a_cred;
 		int  a_waitfor;
 		struct thread * a_td;
 	} */ *ap;
 {
 /*	return (smb_flush(ap->a_vp, ap->a_cred, ap->a_waitfor, ap->a_td, 1));*/
     return (0);
 }
 
 static 
 int smbfs_print (ap) 
 	struct vop_print_args /* {
 	struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 
 	if (np == NULL) {
 		printf("no smbnode data\n");
 		return (0);
 	}
 	printf("\tname = %s, parent = %p, open = %d\n", np->n_name,
 	    np->n_parent ? np->n_parent : NULL, (np->n_flag & NOPEN) != 0);
 	return (0);
 }
 
 static int
 smbfs_pathconf (ap)
 	struct vop_pathconf_args  /* {
 	struct vnode *vp;
 	int name;
 	register_t *retval;
 	} */ *ap;
 {
 	struct smbmount *smp = VFSTOSMBFS(VTOVFS(ap->a_vp));
 	struct smb_vc *vcp = SSTOVC(smp->sm_share);
 	long *retval = ap->a_retval;
 	int error = 0;
 	
 	switch (ap->a_name) {
 	    case _PC_FILESIZEBITS:
 		if (vcp->vc_sopt.sv_caps & (SMB_CAP_LARGE_READX |
 		    SMB_CAP_LARGE_WRITEX))
 		    *retval = 64;
 		else
 		    *retval = 32;
 		break;
 	    case _PC_NAME_MAX:
 		*retval = (vcp->vc_hflags2 & SMB_FLAGS2_KNOWS_LONG_NAMES) ? 255 : 12;
 		break;
 	    case _PC_PATH_MAX:
 		*retval = 800;	/* XXX: a correct one ? */
 		break;
 	    case _PC_NO_TRUNC:
 		*retval = 1;
 		break;
 	    default:
 		error = vop_stdpathconf(ap);
 	}
 	return error;
 }
 
 static int
 smbfs_strategy (ap) 
 	struct vop_strategy_args /* {
 	struct buf *a_bp
 	} */ *ap;
 {
 	struct buf *bp=ap->a_bp;
 	struct ucred *cr;
 	struct thread *td;
 
 	SMBVDEBUG("\n");
 	if (bp->b_flags & B_ASYNC)
 		td = (struct thread *)0;
 	else
 		td = curthread;	/* XXX */
 	if (bp->b_iocmd == BIO_READ)
 		cr = bp->b_rcred;
 	else
 		cr = bp->b_wcred;
 
 	if ((bp->b_flags & B_ASYNC) == 0 )
 		(void)smbfs_doio(ap->a_vp, bp, cr, td);
 	return (0);
 }
 
 int
 smbfs_ioctl(ap)
 	struct vop_ioctl_args /* {
 		struct vnode *a_vp;
 		u_long a_command;
 		caddr_t a_data;
 		int fflag;
 		struct ucred *cred;
 		struct thread *td;
 	} */ *ap;
 {
 	return ENOTTY;
 }
 
 static char smbfs_atl[] = "rhsvda";
 static int
 smbfs_getextattr(struct vop_getextattr_args *ap)
 /* {
         IN struct vnode *a_vp;
         IN char *a_name;
         INOUT struct uio *a_uio;
         IN struct ucred *a_cred;
         IN struct thread *a_td;
 };
 */
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	struct uio *uio = ap->a_uio;
 	const char *name = ap->a_name;
 	struct smbnode *np = VTOSMB(vp);
 	struct vattr vattr;
 	char buf[10];
 	int i, attr, error;
 
 	error = VOP_ACCESS(vp, VREAD, cred, td);
 	if (error)
 		return error;
 	error = VOP_GETATTR(vp, &vattr, cred);
 	if (error)
 		return error;
 	if (strcmp(name, "dosattr") == 0) {
 		attr = np->n_dosattr;
 		for (i = 0; i < 6; i++, attr >>= 1)
 			buf[i] = (attr & 1) ? smbfs_atl[i] : '-';
 		buf[i] = 0;
 		error = uiomove(buf, i, uio);
 		
 	} else
 		error = EINVAL;
 	return error;
 }
 
 /*
  * Since we expected to support F_GETLK (and SMB protocol has no such function),
  * it is necessary to use lf_advlock(). It would be nice if this function had
  * a callback mechanism because it will help to improve a level of consistency.
  */
 int
 smbfs_advlock(ap)
 	struct vop_advlock_args /* {
 		struct vnode *a_vp;
 		caddr_t  a_id;
 		int  a_op;
 		struct flock *a_fl;
 		int  a_flags;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct smbnode *np = VTOSMB(vp);
 	struct flock *fl = ap->a_fl;
 	caddr_t id = (caddr_t)1 /* ap->a_id */;
 /*	int flags = ap->a_flags;*/
 	struct thread *td = curthread;
 	struct smb_cred *scred;
 	u_quad_t size;
 	off_t start, end, oadd;
 	int error, lkop;
 
 	if (vp->v_type == VDIR) {
 		/*
 		 * SMB protocol have no support for directory locking.
 		 * Although locks can be processed on local machine, I don't
 		 * think that this is a good idea, because some programs
 		 * can work wrong assuming directory is locked. So, we just
 		 * return 'operation not supported
 		 */
 		 return EOPNOTSUPP;
 	}
 	size = np->n_size;
 	switch (fl->l_whence) {
 
 	case SEEK_SET:
 	case SEEK_CUR:
 		start = fl->l_start;
 		break;
 
 	case SEEK_END:
 		if (size > OFF_MAX ||
 		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
 			return EOVERFLOW;
 		start = size + fl->l_start;
 		break;
 
 	default:
 		return EINVAL;
 	}
 	if (start < 0)
 		return EINVAL;
 	if (fl->l_len < 0) {
 		if (start == 0)
 			return EINVAL;
 		end = start - 1;
 		start += fl->l_len;
 		if (start < 0)
 			return EINVAL;
 	} else if (fl->l_len == 0)
 		end = -1;
 	else {
 		oadd = fl->l_len - 1;
 		if (oadd > OFF_MAX - start)
 			return EOVERFLOW;
 		end = start + oadd;
 	}
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, td->td_ucred);
 	switch (ap->a_op) {
 	    case F_SETLK:
 		switch (fl->l_type) {
 		    case F_WRLCK:
 			lkop = SMB_LOCK_EXCL;
 			break;
 		    case F_RDLCK:
 			lkop = SMB_LOCK_SHARED;
 			break;
 		    case F_UNLCK:
 			lkop = SMB_LOCK_RELEASE;
 			break;
 		    default:
 			smbfs_free_scred(scred);
 			return EINVAL;
 		}
 		error = lf_advlock(ap, &vp->v_lockf, size);
 		if (error)
 			break;
 		lkop = SMB_LOCK_EXCL;
 		error = smbfs_smb_lock(np, lkop, id, start, end, scred);
 		if (error) {
 			int oldtype = fl->l_type;
 			fl->l_type = F_UNLCK;
 			ap->a_op = F_UNLCK;
 			lf_advlock(ap, &vp->v_lockf, size);
 			fl->l_type = oldtype;
 		}
 		break;
 	    case F_UNLCK:
 		lf_advlock(ap, &vp->v_lockf, size);
 		error = smbfs_smb_lock(np, SMB_LOCK_RELEASE, id, start, end, scred);
 		break;
 	    case F_GETLK:
 		error = lf_advlock(ap, &vp->v_lockf, size);
 		break;
 	    default:
 		smbfs_free_scred(scred);
 		return EINVAL;
 	}
 	smbfs_free_scred(scred);
 	return error;
 }
 
 static int
 smbfs_pathcheck(struct smbmount *smp, const char *name, int nmlen, int nameiop)
 {
 	static const char *badchars = "*/:<>?";
 	static const char *badchars83 = " +|,[]=;";
 	const char *cp;
 	int i, error;
 
 	/*
 	 * Backslash characters, being a path delimiter, are prohibited
 	 * within a path component even for LOOKUP operations.
 	 */
 	if (strchr(name, '\\') != NULL)
 		return ENOENT;
 
 	if (nameiop == LOOKUP)
 		return 0;
 	error = ENOENT;
 	if (SMB_DIALECT(SSTOVC(smp->sm_share)) < SMB_DIALECT_LANMAN2_0) {
 		/*
 		 * Name should conform 8.3 format
 		 */
 		if (nmlen > 12)
 			return ENAMETOOLONG;
 		cp = strchr(name, '.');
 		if (cp == NULL)
 			return error;
 		if (cp == name || (cp - name) > 8)
 			return error;
 		cp = strchr(cp + 1, '.');
 		if (cp != NULL)
 			return error;
 		for (cp = name, i = 0; i < nmlen; i++, cp++)
 			if (strchr(badchars83, *cp) != NULL)
 				return error;
 	}
 	for (cp = name, i = 0; i < nmlen; i++, cp++)
 		if (strchr(badchars, *cp) != NULL)
 			return error;
 	return 0;
 }
 
 /*
  * Things go even weird without fixed inode numbers...
  */
 int
 smbfs_lookup(ap)
 	struct vop_lookup_args /* {
 		struct vnodeop_desc *a_desc;
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct vnode *vp;
 	struct smbmount *smp;
 	struct mount *mp = dvp->v_mount;
 	struct smbnode *dnp;
 	struct smbfattr fattr, *fap;
 	struct smb_cred *scred;
 	char *name = cnp->cn_nameptr;
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	int nmlen = cnp->cn_namelen;
 	int error, islastcn, isdot;
 	int killit;
 	
 	SMBVDEBUG("\n");
 	if (dvp->v_type != VDIR)
 		return ENOTDIR;
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) {
 		SMBFSERR("invalid '..'\n");
 		return EIO;
 	}
 	islastcn = flags & ISLASTCN;
 	if (islastcn && (mp->mnt_flag & MNT_RDONLY) && (nameiop != LOOKUP))
 		return EROFS;
 	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td)) != 0)
 		return error;
 	smp = VFSTOSMBFS(mp);
 	dnp = VTOSMB(dvp);
 	isdot = (nmlen == 1 && name[0] == '.');
 
 	error = smbfs_pathcheck(smp, cnp->cn_nameptr, cnp->cn_namelen, nameiop);
 
 	if (error) 
 		return ENOENT;
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	SMBVDEBUG("cache_lookup returned %d\n", error);
 	if (error > 0)
 		return error;
 	if (error) {		/* name was found */
 		struct vattr vattr;
 
 		killit = 0;
 		vp = *vpp;
 		error = VOP_GETATTR(vp, &vattr, cnp->cn_cred);
 		/*
 		 * If the file type on the server is inconsistent
 		 * with what it was when we created the vnode,
 		 * kill the bogus vnode now and fall through to
 		 * the code below to create a new one with the
 		 * right type.
 		 */
 		if (error == 0 &&
 		   ((vp->v_type == VDIR &&
 		   (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) == 0) ||
 		   (vp->v_type == VREG &&
 		   (VTOSMB(vp)->n_dosattr & SMB_FA_DIR) != 0)))
 		   killit = 1;
 		else if (error == 0
 	     /*    && vattr.va_ctime.tv_sec == VTOSMB(vp)->n_ctime*/) {
 		     if (nameiop != LOOKUP && islastcn)
 			     cnp->cn_flags |= SAVENAME;
 		     SMBVDEBUG("use cached vnode\n");
 		     return (0);
 		}
 		cache_purge(vp);
 		/*
 		 * XXX This is not quite right, if '.' is
 		 * inconsistent, we really need to start the lookup
 		 * all over again.  Hopefully there is some other
 		 * guarantee that prevents this case from happening.
 		 */
 		if (killit && vp != dvp)
 			vgone(vp);
 		if (vp != dvp)
 			vput(vp);
 		else
 			vrele(vp);
 		*vpp = NULLVP;
 	}
 	/* 
 	 * entry is not in the cache or has been expired
 	 */
 	error = 0;
 	*vpp = NULLVP;
 	scred = smbfs_malloc_scred();
 	smb_makescred(scred, td, cnp->cn_cred);
 	fap = &fattr;
 	if (flags & ISDOTDOT) {
 		/*
 		 * In the DOTDOT case, don't go over-the-wire
 		 * in order to request attributes. We already
 		 * know it's a directory and subsequent call to
 		 * smbfs_getattr() will restore consistency.
 		 *
 		 */
 		SMBVDEBUG("smbfs_smb_lookup: dotdot\n");
 	} else if (isdot) {
 		error = smbfs_smb_lookup(dnp, NULL, 0, fap, scred);
 		SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error);
 	}
 	else {
 		error = smbfs_smb_lookup(dnp, name, nmlen, fap, scred);
 		SMBVDEBUG("result of smbfs_smb_lookup: %d\n", error);
 	}
 	if (error && error != ENOENT)
 		goto out;
 	if (error) {			/* entry not found */
 		/*
 		 * Handle RENAME or CREATE case...
 		 */
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error)
 				goto out;
 			cnp->cn_flags |= SAVENAME;
 			error = EJUSTRETURN;
 			goto out;
 		}
 		error = ENOENT;
 		goto out;
 	}/* else {
 		SMBVDEBUG("Found entry %s with id=%d\n", fap->entryName, fap->dirEntNum);
 	}*/
 	/*
 	 * handle DELETE case ...
 	 */
 	if (nameiop == DELETE && islastcn) { 	/* delete last component */
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			goto out;
 		if (isdot) {
 			VREF(dvp);
 			*vpp = dvp;
 			goto out;
 		}
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			goto out;
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;
 		goto out;
 	}
 	if (nameiop == RENAME && islastcn) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			goto out;
 		if (isdot) {
 			error = EISDIR;
 			goto out;
 		}
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			goto out;
 		*vpp = vp;
 		cnp->cn_flags |= SAVENAME;
 		goto out;
 	}
 	if (flags & ISDOTDOT) {
 		mp = dvp->v_mount;
 		error = vfs_busy(mp, MBF_NOWAIT);
 		if (error != 0) {
 			vfs_ref(mp);
 			VOP_UNLOCK(dvp, 0);
 			error = vfs_busy(mp, 0);
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 			vfs_rel(mp);
 			if (error) {
 				error = ENOENT;
 				goto out;
 			}
 			if (VN_IS_DOOMED(dvp)) {
 				vfs_unbusy(mp);
 				error = ENOENT;
 				goto out;
 			}
 		}	
 		VOP_UNLOCK(dvp, 0);
 		error = smbfs_nget(mp, dvp, name, nmlen, NULL, &vp);
 		vfs_unbusy(mp);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		if (VN_IS_DOOMED(dvp)) {
 			if (error == 0)
 				vput(vp);
 			error = ENOENT;
 		}
 		if (error)
 			goto out;
 		*vpp = vp;
 	} else if (isdot) {
 		vref(dvp);
 		*vpp = dvp;
 	} else {
 		error = smbfs_nget(mp, dvp, name, nmlen, fap, &vp);
 		if (error)
 			goto out;
 		*vpp = vp;
 		SMBVDEBUG("lookup: getnewvp!\n");
 	}
 	if ((cnp->cn_flags & MAKEENTRY)/* && !islastcn*/) {
 /*		VTOSMB(*vpp)->n_ctime = VTOSMB(*vpp)->n_vattr.va_ctime.tv_sec;*/
 		cache_enter(dvp, *vpp, cnp);
 	}
 out:
 	smbfs_free_scred(scred);
 	return (error);
 }
Index: head/sys/fs/tmpfs/tmpfs_fifoops.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_fifoops.c	(revision 355789)
+++ head/sys/fs/tmpfs/tmpfs_fifoops.c	(revision 355790)
@@ -1,77 +1,78 @@
 /*	$NetBSD: tmpfs_fifoops.c,v 1.5 2005/12/11 12:24:29 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * tmpfs vnode interface for named pipes.
  */
 #include <sys/cdefs.h>
  __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 
 #include <fs/tmpfs/tmpfs.h>
 #include <fs/tmpfs/tmpfs_fifoops.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 
 static int
 tmpfs_fifo_close(struct vop_close_args *v)
 {
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(v->a_vp);
 	tmpfs_set_status(VFS_TO_TMPFS(v->a_vp->v_mount), node,
 	    TMPFS_NODE_ACCESSED);
 	tmpfs_update(v->a_vp);
 	return (fifo_specops.vop_close(v));
 }
 
 /*
  * vnode operations vector used for fifos stored in a tmpfs file system.
  */
 struct vop_vector tmpfs_fifoop_entries = {
 	.vop_default =			&fifo_specops,
 	.vop_close =			tmpfs_fifo_close,
 	.vop_reclaim =			tmpfs_reclaim,
 	.vop_access =			tmpfs_access,
 	.vop_getattr =			tmpfs_getattr,
 	.vop_setattr =			tmpfs_setattr,
 	.vop_pathconf =                 tmpfs_pathconf,
 	.vop_print =			tmpfs_print,
 };
+VFS_VOP_VECTOR_REGISTER(tmpfs_fifoop_entries);
Index: head/sys/fs/tmpfs/tmpfs_vnops.c
===================================================================
--- head/sys/fs/tmpfs/tmpfs_vnops.c	(revision 355789)
+++ head/sys/fs/tmpfs/tmpfs_vnops.c	(revision 355790)
@@ -1,1646 +1,1648 @@
 /*	$NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
  * 2005 program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * tmpfs vnode interface.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/dirent.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lockf.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 
 #include <fs/tmpfs/tmpfs_vnops.h>
 #include <fs/tmpfs/tmpfs.h>
 
 SYSCTL_DECL(_vfs_tmpfs);
 
 static volatile int tmpfs_rename_restarts;
 SYSCTL_INT(_vfs_tmpfs, OID_AUTO, rename_restarts, CTLFLAG_RD,
     __DEVOLATILE(int *, &tmpfs_rename_restarts), 0,
     "Times rename had to restart due to lock contention");
 
 static int
 tmpfs_vn_get_ino_alloc(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (tmpfs_alloc_vp(mp, arg, lkflags, rvp));
 }
 
 static int
 tmpfs_lookup1(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct tmpfs_dirent *de;
 	struct tmpfs_node *dnode, *pnode;
 	struct tmpfs_mount *tm;
 	int error;
 
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	*vpp = NULLVP;
 
 	/* Check accessibility of requested node as a first step. */
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error != 0)
 		goto out;
 
 	/* We cannot be requesting the parent directory of the root node. */
 	MPASS(IMPLIES(dnode->tn_type == VDIR &&
 	    dnode->tn_dir.tn_parent == dnode,
 	    !(cnp->cn_flags & ISDOTDOT)));
 
 	TMPFS_ASSERT_LOCKED(dnode);
 	if (dnode->tn_dir.tn_parent == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 	if (cnp->cn_flags & ISDOTDOT) {
 		tm = VFS_TO_TMPFS(dvp->v_mount);
 		pnode = dnode->tn_dir.tn_parent;
 		tmpfs_ref_node(pnode);
 		error = vn_vget_ino_gen(dvp, tmpfs_vn_get_ino_alloc,
 		    pnode, cnp->cn_lkflags, vpp);
 		tmpfs_free_node(tm, pnode);
 		if (error != 0)
 			goto out;
 	} else if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') {
 		VREF(dvp);
 		*vpp = dvp;
 		error = 0;
 	} else {
 		de = tmpfs_dir_lookup(dnode, NULL, cnp);
 		if (de != NULL && de->td_node == NULL)
 			cnp->cn_flags |= ISWHITEOUT;
 		if (de == NULL || de->td_node == NULL) {
 			/*
 			 * The entry was not found in the directory.
 			 * This is OK if we are creating or renaming an
 			 * entry and are working on the last component of
 			 * the path name.
 			 */
 			if ((cnp->cn_flags & ISLASTCN) &&
 			    (cnp->cn_nameiop == CREATE || \
 			    cnp->cn_nameiop == RENAME ||
 			    (cnp->cn_nameiop == DELETE &&
 			    cnp->cn_flags & DOWHITEOUT &&
 			    cnp->cn_flags & ISWHITEOUT))) {
 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
 				    cnp->cn_thread);
 				if (error != 0)
 					goto out;
 
 				/*
 				 * Keep the component name in the buffer for
 				 * future uses.
 				 */
 				cnp->cn_flags |= SAVENAME;
 
 				error = EJUSTRETURN;
 			} else
 				error = ENOENT;
 		} else {
 			struct tmpfs_node *tnode;
 
 			/*
 			 * The entry was found, so get its associated
 			 * tmpfs_node.
 			 */
 			tnode = de->td_node;
 
 			/*
 			 * If we are not at the last path component and
 			 * found a non-directory or non-link entry (which
 			 * may itself be pointing to a directory), raise
 			 * an error.
 			 */
 			if ((tnode->tn_type != VDIR &&
 			    tnode->tn_type != VLNK) &&
 			    !(cnp->cn_flags & ISLASTCN)) {
 				error = ENOTDIR;
 				goto out;
 			}
 
 			/*
 			 * If we are deleting or renaming the entry, keep
 			 * track of its tmpfs_dirent so that it can be
 			 * easily deleted later.
 			 */
 			if ((cnp->cn_flags & ISLASTCN) &&
 			    (cnp->cn_nameiop == DELETE ||
 			    cnp->cn_nameiop == RENAME)) {
 				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred,
 				    cnp->cn_thread);
 				if (error != 0)
 					goto out;
 
 				/* Allocate a new vnode on the matching entry. */
 				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
 				    cnp->cn_lkflags, vpp);
 				if (error != 0)
 					goto out;
 
 				if ((dnode->tn_mode & S_ISTXT) &&
 				  VOP_ACCESS(dvp, VADMIN, cnp->cn_cred,
 				  cnp->cn_thread) && VOP_ACCESS(*vpp, VADMIN,
 				  cnp->cn_cred, cnp->cn_thread)) {
 					error = EPERM;
 					vput(*vpp);
 					*vpp = NULL;
 					goto out;
 				}
 				cnp->cn_flags |= SAVENAME;
 			} else {
 				error = tmpfs_alloc_vp(dvp->v_mount, tnode,
 				    cnp->cn_lkflags, vpp);
 				if (error != 0)
 					goto out;
 			}
 		}
 	}
 
 	/*
 	 * Store the result of this lookup in the cache.  Avoid this if the
 	 * request was for creation, as it does not improve timings on
 	 * emprical tests.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp))
 		cache_enter(dvp, *vpp, cnp);
 
 out:
 	/*
 	 * If there were no errors, *vpp cannot be null and it must be
 	 * locked.
 	 */
 	MPASS(IFF(error == 0, *vpp != NULLVP && VOP_ISLOCKED(*vpp)));
 
 	return (error);
 }
 
 static int
 tmpfs_cached_lookup(struct vop_cachedlookup_args *v)
 {
 
 	return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp));
 }
 
 static int
 tmpfs_lookup(struct vop_lookup_args *v)
 {
 
 	return (tmpfs_lookup1(v->a_dvp, v->a_vpp, v->a_cnp));
 }
 
 static int
 tmpfs_create(struct vop_create_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode **vpp = v->a_vpp;
 	struct componentname *cnp = v->a_cnp;
 	struct vattr *vap = v->a_vap;
 	int error;
 
 	MPASS(vap->va_type == VREG || vap->va_type == VSOCK);
 
 	error = tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL);
 	if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0 && tmpfs_use_nc(dvp))
 		cache_enter(dvp, *vpp, cnp);
 	return (error);
 }
 
 static int
 tmpfs_mknod(struct vop_mknod_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode **vpp = v->a_vpp;
 	struct componentname *cnp = v->a_cnp;
 	struct vattr *vap = v->a_vap;
 
 	if (vap->va_type != VBLK && vap->va_type != VCHR &&
 	    vap->va_type != VFIFO)
 		return EINVAL;
 
 	return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL);
 }
 
 static int
 tmpfs_open(struct vop_open_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	int mode = v->a_mode;
 
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(vp));
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* The file is still active but all its names have been removed
 	 * (e.g. by a "rmdir $(pwd)").  It cannot be opened any more as
 	 * it is about to die. */
 	if (node->tn_links < 1)
 		return (ENOENT);
 
 	/* If the file is marked append-only, deny write requests. */
 	if (node->tn_flags & APPEND && (mode & (FWRITE | O_APPEND)) == FWRITE)
 		error = EPERM;
 	else {
 		error = 0;
 		/* For regular files, the call below is nop. */
 		KASSERT(vp->v_type != VREG || (node->tn_reg.tn_aobj->flags &
 		    OBJ_DEAD) == 0, ("dead object"));
 		vnode_create_vobject(vp, node->tn_size, v->a_td);
 	}
 
 	MPASS(VOP_ISLOCKED(vp));
 	return error;
 }
 
 static int
 tmpfs_close(struct vop_close_args *v)
 {
 	struct vnode *vp = v->a_vp;
 
 	/* Update node times. */
 	tmpfs_update(vp);
 
 	return (0);
 }
 
 int
 tmpfs_access(struct vop_access_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	accmode_t accmode = v->a_accmode;
 	struct ucred *cred = v->a_cred;
 	mode_t all_x = S_IXUSR | S_IXGRP | S_IXOTH;
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(vp));
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/*
 	 * Common case path lookup.
 	 */
 	if (__predict_true(accmode == VEXEC && (node->tn_mode & all_x) == all_x))
 		return (0);
 
 	switch (vp->v_type) {
 	case VDIR:
 		/* FALLTHROUGH */
 	case VLNK:
 		/* FALLTHROUGH */
 	case VREG:
 		if (accmode & VWRITE && vp->v_mount->mnt_flag & MNT_RDONLY) {
 			error = EROFS;
 			goto out;
 		}
 		break;
 
 	case VBLK:
 		/* FALLTHROUGH */
 	case VCHR:
 		/* FALLTHROUGH */
 	case VSOCK:
 		/* FALLTHROUGH */
 	case VFIFO:
 		break;
 
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 	if (accmode & VWRITE && node->tn_flags & IMMUTABLE) {
 		error = EPERM;
 		goto out;
 	}
 
 	error = vaccess(vp->v_type, node->tn_mode, node->tn_uid,
 	    node->tn_gid, accmode, cred, NULL);
 
 out:
 	MPASS(VOP_ISLOCKED(vp));
 
 	return error;
 }
 
 int
 tmpfs_getattr(struct vop_getattr_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct vattr *vap = v->a_vap;
 	vm_object_t obj;
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	tmpfs_update(vp);
 
 	vap->va_type = vp->v_type;
 	vap->va_mode = node->tn_mode;
 	vap->va_nlink = node->tn_links;
 	vap->va_uid = node->tn_uid;
 	vap->va_gid = node->tn_gid;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = node->tn_id;
 	vap->va_size = node->tn_size;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_atime = node->tn_atime;
 	vap->va_mtime = node->tn_mtime;
 	vap->va_ctime = node->tn_ctime;
 	vap->va_birthtime = node->tn_birthtime;
 	vap->va_gen = node->tn_gen;
 	vap->va_flags = node->tn_flags;
 	vap->va_rdev = (vp->v_type == VBLK || vp->v_type == VCHR) ?
 		node->tn_rdev : NODEV;
 	if (vp->v_type == VREG) {
 		obj = node->tn_reg.tn_aobj;
 		vap->va_bytes = (u_quad_t)obj->resident_page_count * PAGE_SIZE;
 	} else
 		vap->va_bytes = node->tn_size;
 	vap->va_filerev = 0;
 
 	return 0;
 }
 
 int
 tmpfs_setattr(struct vop_setattr_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct vattr *vap = v->a_vap;
 	struct ucred *cred = v->a_cred;
 	struct thread *td = curthread;
 
 	int error;
 
 	MPASS(VOP_ISLOCKED(vp));
 
 	error = 0;
 
 	/* Abort if any unsettable attribute is given. */
 	if (vap->va_type != VNON ||
 	    vap->va_nlink != VNOVAL ||
 	    vap->va_fsid != VNOVAL ||
 	    vap->va_fileid != VNOVAL ||
 	    vap->va_blocksize != VNOVAL ||
 	    vap->va_gen != VNOVAL ||
 	    vap->va_rdev != VNOVAL ||
 	    vap->va_bytes != VNOVAL)
 		error = EINVAL;
 
 	if (error == 0 && (vap->va_flags != VNOVAL))
 		error = tmpfs_chflags(vp, vap->va_flags, cred, td);
 
 	if (error == 0 && (vap->va_size != VNOVAL))
 		error = tmpfs_chsize(vp, vap->va_size, cred, td);
 
 	if (error == 0 && (vap->va_uid != VNOVAL || vap->va_gid != VNOVAL))
 		error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred, td);
 
 	if (error == 0 && (vap->va_mode != (mode_t)VNOVAL))
 		error = tmpfs_chmod(vp, vap->va_mode, cred, td);
 
 	if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL &&
 	    vap->va_atime.tv_nsec != VNOVAL) ||
 	    (vap->va_mtime.tv_sec != VNOVAL &&
 	    vap->va_mtime.tv_nsec != VNOVAL) ||
 	    (vap->va_birthtime.tv_sec != VNOVAL &&
 	    vap->va_birthtime.tv_nsec != VNOVAL)))
 		error = tmpfs_chtimes(vp, vap, cred, td);
 
 	/* Update the node times.  We give preference to the error codes
 	 * generated by this function rather than the ones that may arise
 	 * from tmpfs_update. */
 	tmpfs_update(vp);
 
 	MPASS(VOP_ISLOCKED(vp));
 
 	return error;
 }
 
 static int
 tmpfs_read(struct vop_read_args *v)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct tmpfs_node *node;
 
 	vp = v->a_vp;
 	if (vp->v_type != VREG)
 		return (EISDIR);
 	uio = v->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	node = VP_TO_TMPFS_NODE(vp);
 	tmpfs_set_status(VFS_TO_TMPFS(vp->v_mount), node, TMPFS_NODE_ACCESSED);
 	return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio));
 }
 
 static int
 tmpfs_write(struct vop_write_args *v)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct tmpfs_node *node;
 	off_t oldsize;
 	int error, ioflag;
 
 	vp = v->a_vp;
 	uio = v->a_uio;
 	ioflag = v->a_ioflag;
 	error = 0;
 	node = VP_TO_TMPFS_NODE(vp);
 	oldsize = node->tn_size;
 
 	if (uio->uio_offset < 0 || vp->v_type != VREG)
 		return (EINVAL);
 	if (uio->uio_resid == 0)
 		return (0);
 	if (ioflag & IO_APPEND)
 		uio->uio_offset = node->tn_size;
 	if (uio->uio_offset + uio->uio_resid >
 	  VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize)
 		return (EFBIG);
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 	if (uio->uio_offset + uio->uio_resid > node->tn_size) {
 		error = tmpfs_reg_resize(vp, uio->uio_offset + uio->uio_resid,
 		    FALSE);
 		if (error != 0)
 			goto out;
 	}
 
 	error = uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio);
 	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED |
 	    TMPFS_NODE_CHANGED;
 	if (node->tn_mode & (S_ISUID | S_ISGID)) {
 		if (priv_check_cred(v->a_cred, PRIV_VFS_RETAINSUGID))
 			node->tn_mode &= ~(S_ISUID | S_ISGID);
 	}
 	if (error != 0)
 		(void)tmpfs_reg_resize(vp, oldsize, TRUE);
 
 out:
 	MPASS(IMPLIES(error == 0, uio->uio_resid == 0));
 	MPASS(IMPLIES(error != 0, oldsize == node->tn_size));
 
 	return (error);
 }
 
 static int
 tmpfs_fsync(struct vop_fsync_args *v)
 {
 	struct vnode *vp = v->a_vp;
 
 	MPASS(VOP_ISLOCKED(vp));
 
 	tmpfs_check_mtime(vp);
 	tmpfs_update(vp);
 
 	return 0;
 }
 
 static int
 tmpfs_remove(struct vop_remove_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode *vp = v->a_vp;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(dvp));
 	MPASS(VOP_ISLOCKED(vp));
 
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	node = VP_TO_TMPFS_NODE(vp);
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 	de = tmpfs_dir_lookup(dnode, node, v->a_cnp);
 	MPASS(de != NULL);
 
 	/* Files marked as immutable or append-only cannot be deleted. */
 	if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) ||
 	    (dnode->tn_flags & APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* Remove the entry from the directory; as it is a file, we do not
 	 * have to change the number of hard links of the directory. */
 	tmpfs_dir_detach(dvp, de);
 	if (v->a_cnp->cn_flags & DOWHITEOUT)
 		tmpfs_dir_whiteout_add(dvp, v->a_cnp);
 
 	/* Free the directory entry we just deleted.  Note that the node
 	 * referred by it will not be removed until the vnode is really
 	 * reclaimed. */
 	tmpfs_free_dirent(tmp, de);
 
 	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED;
 	error = 0;
 
 out:
 
 	return error;
 }
 
 static int
 tmpfs_link(struct vop_link_args *v)
 {
 	struct vnode *dvp = v->a_tdvp;
 	struct vnode *vp = v->a_vp;
 	struct componentname *cnp = v->a_cnp;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(dvp));
 	MPASS(cnp->cn_flags & HASBUF);
 	MPASS(dvp != vp); /* XXX When can this be false? */
 	node = VP_TO_TMPFS_NODE(vp);
 
 	/* Ensure that we do not overflow the maximum number of links imposed
 	 * by the system. */
 	MPASS(node->tn_links <= TMPFS_LINK_MAX);
 	if (node->tn_links == TMPFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 
 	/* We cannot create links of files marked immutable or append-only. */
 	if (node->tn_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* Allocate a new directory entry to represent the node. */
 	error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node,
 	    cnp->cn_nameptr, cnp->cn_namelen, &de);
 	if (error != 0)
 		goto out;
 
 	/* Insert the new directory entry into the appropriate directory. */
 	if (cnp->cn_flags & ISWHITEOUT)
 		tmpfs_dir_whiteout_remove(dvp, cnp);
 	tmpfs_dir_attach(dvp, de);
 
 	/* vp link count has changed, so update node times. */
 	node->tn_status |= TMPFS_NODE_CHANGED;
 	tmpfs_update(vp);
 
 	error = 0;
 
 out:
 	return error;
 }
 
 /*
  * We acquire all but fdvp locks using non-blocking acquisitions.  If we
  * fail to acquire any lock in the path we will drop all held locks,
  * acquire the new lock in a blocking fashion, and then release it and
  * restart the rename.  This acquire/release step ensures that we do not
  * spin on a lock waiting for release.  On error release all vnode locks
  * and decrement references the way tmpfs_rename() would do.
  */
 static int
 tmpfs_rename_relock(struct vnode *fdvp, struct vnode **fvpp,
     struct vnode *tdvp, struct vnode **tvpp,
     struct componentname *fcnp, struct componentname *tcnp)
 {
 	struct vnode *nvp;
 	struct mount *mp;
 	struct tmpfs_dirent *de;
 	int error, restarts = 0;
 
 	VOP_UNLOCK(tdvp, 0);
 	if (*tvpp != NULL && *tvpp != tdvp)
 		VOP_UNLOCK(*tvpp, 0);
 	mp = fdvp->v_mount;
 
 relock:
 	restarts += 1;
 	error = vn_lock(fdvp, LK_EXCLUSIVE);
 	if (error)
 		goto releout;
 	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(fdvp, 0);
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto releout;
 		VOP_UNLOCK(tdvp, 0);
 		goto relock;
 	}
 	/*
 	 * Re-resolve fvp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(fdvp), NULL, fcnp);
 	if (de == NULL) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if ((fcnp->cn_flags & ISDOTDOT) != 0 ||
 		    (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.'))
 			error = EINVAL;
 		else
 			error = ENOENT;
 		goto releout;
 	}
 	error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 	if (error != 0) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (error != EBUSY)
 			goto releout;
 		error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE, &nvp);
 		if (error != 0)
 			goto releout;
 		VOP_UNLOCK(nvp, 0);
 		/*
 		 * Concurrent rename race.
 		 */
 		if (nvp == tdvp) {
 			vrele(nvp);
 			error = EINVAL;
 			goto releout;
 		}
 		vrele(*fvpp);
 		*fvpp = nvp;
 		goto relock;
 	}
 	vrele(*fvpp);
 	*fvpp = nvp;
 	VOP_UNLOCK(*fvpp, 0);
 	/*
 	 * Re-resolve tvp and acquire the vnode lock if present.
 	 */
 	de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(tdvp), NULL, tcnp);
 	/*
 	 * If tvp disappeared we just carry on.
 	 */
 	if (de == NULL && *tvpp != NULL) {
 		vrele(*tvpp);
 		*tvpp = NULL;
 	}
 	/*
 	 * Get the tvp ino if the lookup succeeded.  We may have to restart
 	 * if the non-blocking acquire fails.
 	 */
 	if (de != NULL) {
 		nvp = NULL;
 		error = tmpfs_alloc_vp(mp, de->td_node,
 		    LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 		if (*tvpp != NULL)
 			vrele(*tvpp);
 		*tvpp = nvp;
 		if (error != 0) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			if (error != EBUSY)
 				goto releout;
 			error = tmpfs_alloc_vp(mp, de->td_node, LK_EXCLUSIVE,
 			    &nvp);
 			if (error != 0)
 				goto releout;
 			VOP_UNLOCK(nvp, 0);
 			/*
 			 * fdvp contains fvp, thus tvp (=fdvp) is not empty.
 			 */
 			if (nvp == fdvp) {
 				error = ENOTEMPTY;
 				goto releout;
 			}
 			goto relock;
 		}
 	}
 	tmpfs_rename_restarts += restarts;
 
 	return (0);
 
 releout:
 	vrele(fdvp);
 	vrele(*fvpp);
 	vrele(tdvp);
 	if (*tvpp != NULL)
 		vrele(*tvpp);
 	tmpfs_rename_restarts += restarts;
 
 	return (error);
 }
 
 static int
 tmpfs_rename(struct vop_rename_args *v)
 {
 	struct vnode *fdvp = v->a_fdvp;
 	struct vnode *fvp = v->a_fvp;
 	struct componentname *fcnp = v->a_fcnp;
 	struct vnode *tdvp = v->a_tdvp;
 	struct vnode *tvp = v->a_tvp;
 	struct componentname *tcnp = v->a_tcnp;
 	char *newname;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *fdnode;
 	struct tmpfs_node *fnode;
 	struct tmpfs_node *tnode;
 	struct tmpfs_node *tdnode;
 	int error;
 
 	MPASS(VOP_ISLOCKED(tdvp));
 	MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp)));
 	MPASS(fcnp->cn_flags & HASBUF);
 	MPASS(tcnp->cn_flags & HASBUF);
 
 	/*
 	 * Disallow cross-device renames.
 	 * XXX Why isn't this done by the caller?
 	 */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp != NULL && fvp->v_mount != tvp->v_mount)) {
 		error = EXDEV;
 		goto out;
 	}
 
 	/* If source and target are the same file, there is nothing to do. */
 	if (fvp == tvp) {
 		error = 0;
 		goto out;
 	}
 
 	/*
 	 * If we need to move the directory between entries, lock the
 	 * source so that we can safely operate on it.
 	 */
 	if (fdvp != tdvp && fdvp != tvp) {
 		if (vn_lock(fdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 			error = tmpfs_rename_relock(fdvp, &fvp, tdvp, &tvp,
 			    fcnp, tcnp);
 			if (error != 0)
 				return (error);
 			ASSERT_VOP_ELOCKED(fdvp,
 			    "tmpfs_rename: fdvp not locked");
 			ASSERT_VOP_ELOCKED(tdvp,
 			    "tmpfs_rename: tdvp not locked");
 			if (tvp != NULL)
 				ASSERT_VOP_ELOCKED(tvp,
 				    "tmpfs_rename: tvp not locked");
 			if (fvp == tvp) {
 				error = 0;
 				goto out_locked;
 			}
 		}
 	}
 
 	tmp = VFS_TO_TMPFS(tdvp->v_mount);
 	tdnode = VP_TO_TMPFS_DIR(tdvp);
 	tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp);
 	fdnode = VP_TO_TMPFS_DIR(fdvp);
 	fnode = VP_TO_TMPFS_NODE(fvp);
 	de = tmpfs_dir_lookup(fdnode, fnode, fcnp);
 
 	/*
 	 * Entry can disappear before we lock fdvp,
 	 * also avoid manipulating '.' and '..' entries.
 	 */
 	if (de == NULL) {
 		if ((fcnp->cn_flags & ISDOTDOT) != 0 ||
 		    (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.'))
 			error = EINVAL;
 		else
 			error = ENOENT;
 		goto out_locked;
 	}
 	MPASS(de->td_node == fnode);
 
 	/*
 	 * If re-naming a directory to another preexisting directory
 	 * ensure that the target directory is empty so that its
 	 * removal causes no side effects.
 	 * Kern_rename guarantees the destination to be a directory
 	 * if the source is one.
 	 */
 	if (tvp != NULL) {
 		MPASS(tnode != NULL);
 
 		if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 		    (tdnode->tn_flags & (APPEND | IMMUTABLE))) {
 			error = EPERM;
 			goto out_locked;
 		}
 
 		if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) {
 			if (tnode->tn_size > 0) {
 				error = ENOTEMPTY;
 				goto out_locked;
 			}
 		} else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) {
 			error = ENOTDIR;
 			goto out_locked;
 		} else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) {
 			error = EISDIR;
 			goto out_locked;
 		} else {
 			MPASS(fnode->tn_type != VDIR &&
 				tnode->tn_type != VDIR);
 		}
 	}
 
 	if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (fdnode->tn_flags & (APPEND | IMMUTABLE))) {
 		error = EPERM;
 		goto out_locked;
 	}
 
 	/*
 	 * Ensure that we have enough memory to hold the new name, if it
 	 * has to be changed.
 	 */
 	if (fcnp->cn_namelen != tcnp->cn_namelen ||
 	    bcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen) != 0) {
 		newname = malloc(tcnp->cn_namelen, M_TMPFSNAME, M_WAITOK);
 	} else
 		newname = NULL;
 
 	/*
 	 * If the node is being moved to another directory, we have to do
 	 * the move.
 	 */
 	if (fdnode != tdnode) {
 		/*
 		 * In case we are moving a directory, we have to adjust its
 		 * parent to point to the new parent.
 		 */
 		if (de->td_node->tn_type == VDIR) {
 			struct tmpfs_node *n;
 
 			/*
 			 * Ensure the target directory is not a child of the
 			 * directory being moved.  Otherwise, we'd end up
 			 * with stale nodes.
 			 */
 			n = tdnode;
 			/*
 			 * TMPFS_LOCK guaranties that no nodes are freed while
 			 * traversing the list. Nodes can only be marked as
 			 * removed: tn_parent == NULL.
 			 */
 			TMPFS_LOCK(tmp);
 			TMPFS_NODE_LOCK(n);
 			while (n != n->tn_dir.tn_parent) {
 				struct tmpfs_node *parent;
 
 				if (n == fnode) {
 					TMPFS_NODE_UNLOCK(n);
 					TMPFS_UNLOCK(tmp);
 					error = EINVAL;
 					if (newname != NULL)
 						    free(newname, M_TMPFSNAME);
 					goto out_locked;
 				}
 				parent = n->tn_dir.tn_parent;
 				TMPFS_NODE_UNLOCK(n);
 				if (parent == NULL) {
 					n = NULL;
 					break;
 				}
 				TMPFS_NODE_LOCK(parent);
 				if (parent->tn_dir.tn_parent == NULL) {
 					TMPFS_NODE_UNLOCK(parent);
 					n = NULL;
 					break;
 				}
 				n = parent;
 			}
 			TMPFS_UNLOCK(tmp);
 			if (n == NULL) {
 				error = EINVAL;
 				if (newname != NULL)
 					    free(newname, M_TMPFSNAME);
 				goto out_locked;
 			}
 			TMPFS_NODE_UNLOCK(n);
 
 			/* Adjust the parent pointer. */
 			TMPFS_VALIDATE_DIR(fnode);
 			TMPFS_NODE_LOCK(de->td_node);
 			de->td_node->tn_dir.tn_parent = tdnode;
 			TMPFS_NODE_UNLOCK(de->td_node);
 
 			/*
 			 * As a result of changing the target of the '..'
 			 * entry, the link count of the source and target
 			 * directories has to be adjusted.
 			 */
 			TMPFS_NODE_LOCK(tdnode);
 			TMPFS_ASSERT_LOCKED(tdnode);
 			tdnode->tn_links++;
 			TMPFS_NODE_UNLOCK(tdnode);
 
 			TMPFS_NODE_LOCK(fdnode);
 			TMPFS_ASSERT_LOCKED(fdnode);
 			fdnode->tn_links--;
 			TMPFS_NODE_UNLOCK(fdnode);
 		}
 	}
 
 	/*
 	 * Do the move: just remove the entry from the source directory
 	 * and insert it into the target one.
 	 */
 	tmpfs_dir_detach(fdvp, de);
 
 	if (fcnp->cn_flags & DOWHITEOUT)
 		tmpfs_dir_whiteout_add(fdvp, fcnp);
 	if (tcnp->cn_flags & ISWHITEOUT)
 		tmpfs_dir_whiteout_remove(tdvp, tcnp);
 
 	/*
 	 * If the name has changed, we need to make it effective by changing
 	 * it in the directory entry.
 	 */
 	if (newname != NULL) {
 		MPASS(tcnp->cn_namelen <= MAXNAMLEN);
 
 		free(de->ud.td_name, M_TMPFSNAME);
 		de->ud.td_name = newname;
 		tmpfs_dirent_init(de, tcnp->cn_nameptr, tcnp->cn_namelen);
 
 		fnode->tn_status |= TMPFS_NODE_CHANGED;
 		tdnode->tn_status |= TMPFS_NODE_MODIFIED;
 	}
 
 	/*
 	 * If we are overwriting an entry, we have to remove the old one
 	 * from the target directory.
 	 */
 	if (tvp != NULL) {
 		struct tmpfs_dirent *tde;
 
 		/* Remove the old entry from the target directory. */
 		tde = tmpfs_dir_lookup(tdnode, tnode, tcnp);
 		tmpfs_dir_detach(tdvp, tde);
 
 		/*
 		 * Free the directory entry we just deleted.  Note that the
 		 * node referred by it will not be removed until the vnode is
 		 * really reclaimed.
 		 */
 		tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), tde);
 	}
 
 	tmpfs_dir_attach(tdvp, de);
 
 	if (tmpfs_use_nc(fvp)) {
 		cache_purge(fvp);
 		if (tvp != NULL)
 			cache_purge(tvp);
 		cache_purge_negative(tdvp);
 	}
 
 	error = 0;
 
 out_locked:
 	if (fdvp != tdvp && fdvp != tvp)
 		VOP_UNLOCK(fdvp, 0);
 
 out:
 	/*
 	 * Release target nodes.
 	 * XXX: I don't understand when tdvp can be the same as tvp, but
 	 * other code takes care of this...
 	 */
 	if (tdvp == tvp)
 		vrele(tdvp);
 	else
 		vput(tdvp);
 	if (tvp != NULL)
 		vput(tvp);
 
 	/* Release source nodes. */
 	vrele(fdvp);
 	vrele(fvp);
 
 	return (error);
 }
 
 static int
 tmpfs_mkdir(struct vop_mkdir_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode **vpp = v->a_vpp;
 	struct componentname *cnp = v->a_cnp;
 	struct vattr *vap = v->a_vap;
 
 	MPASS(vap->va_type == VDIR);
 
 	return tmpfs_alloc_file(dvp, vpp, vap, cnp, NULL);
 }
 
 static int
 tmpfs_rmdir(struct vop_rmdir_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode *vp = v->a_vp;
 
 	int error;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *dnode;
 	struct tmpfs_node *node;
 
 	MPASS(VOP_ISLOCKED(dvp));
 	MPASS(VOP_ISLOCKED(vp));
 
 	tmp = VFS_TO_TMPFS(dvp->v_mount);
 	dnode = VP_TO_TMPFS_DIR(dvp);
 	node = VP_TO_TMPFS_DIR(vp);
 
 	/* Directories with more than two entries ('.' and '..') cannot be
 	 * removed. */
 	 if (node->tn_size > 0) {
 		 error = ENOTEMPTY;
 		 goto out;
 	 }
 
 	if ((dnode->tn_flags & APPEND)
 	    || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 
 	/* This invariant holds only if we are not trying to remove "..".
 	  * We checked for that above so this is safe now. */
 	MPASS(node->tn_dir.tn_parent == dnode);
 
 	/* Get the directory entry associated with node (vp).  This was
 	 * filled by tmpfs_lookup while looking up the entry. */
 	de = tmpfs_dir_lookup(dnode, node, v->a_cnp);
 	MPASS(TMPFS_DIRENT_MATCHES(de,
 	    v->a_cnp->cn_nameptr,
 	    v->a_cnp->cn_namelen));
 
 	/* Check flags to see if we are allowed to remove the directory. */
 	if ((dnode->tn_flags & APPEND) != 0 ||
 	    (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0) {
 		error = EPERM;
 		goto out;
 	}
 
 
 	/* Detach the directory entry from the directory (dnode). */
 	tmpfs_dir_detach(dvp, de);
 	if (v->a_cnp->cn_flags & DOWHITEOUT)
 		tmpfs_dir_whiteout_add(dvp, v->a_cnp);
 
 	/* No vnode should be allocated for this entry from this point */
 	TMPFS_NODE_LOCK(node);
 	node->tn_links--;
 	node->tn_dir.tn_parent = NULL;
 	node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
 	    TMPFS_NODE_MODIFIED;
 
 	TMPFS_NODE_UNLOCK(node);
 
 	TMPFS_NODE_LOCK(dnode);
 	dnode->tn_links--;
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED |
 	    TMPFS_NODE_MODIFIED;
 	TMPFS_NODE_UNLOCK(dnode);
 
 	if (tmpfs_use_nc(dvp)) {
 		cache_purge(dvp);
 		cache_purge(vp);
 	}
 
 	/* Free the directory entry we just deleted.  Note that the node
 	 * referred by it will not be removed until the vnode is really
 	 * reclaimed. */
 	tmpfs_free_dirent(tmp, de);
 
 	/* Release the deleted vnode (will destroy the node, notify
 	 * interested parties and clean it from the cache). */
 
 	dnode->tn_status |= TMPFS_NODE_CHANGED;
 	tmpfs_update(dvp);
 
 	error = 0;
 
 out:
 	return error;
 }
 
 static int
 tmpfs_symlink(struct vop_symlink_args *v)
 {
 	struct vnode *dvp = v->a_dvp;
 	struct vnode **vpp = v->a_vpp;
 	struct componentname *cnp = v->a_cnp;
 	struct vattr *vap = v->a_vap;
 	const char *target = v->a_target;
 
 #ifdef notyet /* XXX FreeBSD BUG: kern_symlink is not setting VLNK */
 	MPASS(vap->va_type == VLNK);
 #else
 	vap->va_type = VLNK;
 #endif
 
 	return tmpfs_alloc_file(dvp, vpp, vap, cnp, target);
 }
 
 static int
 tmpfs_readdir(struct vop_readdir_args *va)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct tmpfs_mount *tm;
 	struct tmpfs_node *node;
 	u_long **cookies;
 	int *eofflag, *ncookies;
 	ssize_t startresid;
 	int error, maxcookies;
 
 	vp = va->a_vp;
 	uio = va->a_uio;
 	eofflag = va->a_eofflag;
 	cookies = va->a_cookies;
 	ncookies = va->a_ncookies;
 
 	/* This operation only makes sense on directory nodes. */
 	if (vp->v_type != VDIR)
 		return ENOTDIR;
 
 	maxcookies = 0;
 	node = VP_TO_TMPFS_DIR(vp);
 	tm = VFS_TO_TMPFS(vp->v_mount);
 
 	startresid = uio->uio_resid;
 
 	/* Allocate cookies for NFS and compat modules. */
 	if (cookies != NULL && ncookies != NULL) {
 		maxcookies = howmany(node->tn_size,
 		    sizeof(struct tmpfs_dirent)) + 2;
 		*cookies = malloc(maxcookies * sizeof(**cookies), M_TEMP,
 		    M_WAITOK);
 		*ncookies = 0;
 	}
 
 	if (cookies == NULL)
 		error = tmpfs_dir_getdents(tm, node, uio, 0, NULL, NULL);
 	else
 		error = tmpfs_dir_getdents(tm, node, uio, maxcookies, *cookies,
 		    ncookies);
 
 	/* Buffer was filled without hitting EOF. */
 	if (error == EJUSTRETURN)
 		error = (uio->uio_resid != startresid) ? 0 : EINVAL;
 
 	if (error != 0 && cookies != NULL && ncookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 
 	if (eofflag != NULL)
 		*eofflag =
 		    (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF);
 
 	return error;
 }
 
 static int
 tmpfs_readlink(struct vop_readlink_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	struct uio *uio = v->a_uio;
 
 	int error;
 	struct tmpfs_node *node;
 
 	MPASS(uio->uio_offset == 0);
 	MPASS(vp->v_type == VLNK);
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	error = uiomove(node->tn_link, MIN(node->tn_size, uio->uio_resid),
 	    uio);
 	tmpfs_set_status(VFS_TO_TMPFS(vp->v_mount), node, TMPFS_NODE_ACCESSED);
 
 	return (error);
 }
 
 static int
 tmpfs_inactive(struct vop_inactive_args *v)
 {
 	struct vnode *vp;
 	struct tmpfs_node *node;
 
 	vp = v->a_vp;
 	node = VP_TO_TMPFS_NODE(vp);
 	if (node->tn_links == 0)
 		vrecycle(vp);
 	else
 		tmpfs_check_mtime(vp);
 	return (0);
 }
 
 static int
 tmpfs_need_inactive(struct vop_need_inactive_args *ap)
 {
 	struct vnode *vp;
 	struct tmpfs_node *node;
 	struct vm_object *obj;
 
 	vp = ap->a_vp;
 	node = VP_TO_TMPFS_NODE(vp);
 	if (node->tn_links == 0)
 		goto need;
 	if (vp->v_type == VREG) {
 		obj = vp->v_object;
 		if (obj->generation != obj->cleangeneration)
 			goto need;
 	}
 	return (0);
 need:
 	return (1);
 }
 
 int
 tmpfs_reclaim(struct vop_reclaim_args *v)
 {
 	struct vnode *vp = v->a_vp;
 
 	struct tmpfs_mount *tmp;
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 	tmp = VFS_TO_TMPFS(vp->v_mount);
 
 	if (vp->v_type == VREG)
 		tmpfs_destroy_vobject(vp, node->tn_reg.tn_aobj);
 	vp->v_object = NULL;
 	if (tmpfs_use_nc(vp))
 		cache_purge(vp);
 
 	TMPFS_NODE_LOCK(node);
 	tmpfs_free_vp(vp);
 
 	/* If the node referenced by this vnode was deleted by the user,
 	 * we must free its associated data structures (now that the vnode
 	 * is being reclaimed). */
 	if (node->tn_links == 0 &&
 	    (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) {
 		node->tn_vpstate = TMPFS_VNODE_DOOMED;
 		TMPFS_NODE_UNLOCK(node);
 		tmpfs_free_node(tmp, node);
 	} else
 		TMPFS_NODE_UNLOCK(node);
 
 	MPASS(vp->v_data == NULL);
 	return 0;
 }
 
 int
 tmpfs_print(struct vop_print_args *v)
 {
 	struct vnode *vp = v->a_vp;
 
 	struct tmpfs_node *node;
 
 	node = VP_TO_TMPFS_NODE(vp);
 
 	printf("tag VT_TMPFS, tmpfs_node %p, flags 0x%lx, links %jd\n",
 	    node, node->tn_flags, (uintmax_t)node->tn_links);
 	printf("\tmode 0%o, owner %d, group %d, size %jd, status 0x%x\n",
 	    node->tn_mode, node->tn_uid, node->tn_gid,
 	    (intmax_t)node->tn_size, node->tn_status);
 
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 
 	printf("\n");
 
 	return 0;
 }
 
 int
 tmpfs_pathconf(struct vop_pathconf_args *v)
 {
 	struct vnode *vp = v->a_vp;
 	int name = v->a_name;
 	long *retval = v->a_retval;
 
 	int error;
 
 	error = 0;
 
 	switch (name) {
 	case _PC_LINK_MAX:
 		*retval = TMPFS_LINK_MAX;
 		break;
 
 	case _PC_NAME_MAX:
 		*retval = NAME_MAX;
 		break;
 
 	case _PC_PIPE_BUF:
 		if (vp->v_type == VDIR || vp->v_type == VFIFO)
 			*retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 
 	case _PC_CHOWN_RESTRICTED:
 		*retval = 1;
 		break;
 
 	case _PC_NO_TRUNC:
 		*retval = 1;
 		break;
 
 	case _PC_SYNC_IO:
 		*retval = 1;
 		break;
 
 	case _PC_FILESIZEBITS:
 		*retval = 64;
 		break;
 
 	default:
 		error = vop_stdpathconf(v);
 	}
 
 	return error;
 }
 
 static int
 tmpfs_vptofh(struct vop_vptofh_args *ap)
 {
 	struct tmpfs_fid *tfhp;
 	struct tmpfs_node *node;
 
 	tfhp = (struct tmpfs_fid *)ap->a_fhp;
 	node = VP_TO_TMPFS_NODE(ap->a_vp);
 
 	tfhp->tf_len = sizeof(struct tmpfs_fid);
 	tfhp->tf_id = node->tn_id;
 	tfhp->tf_gen = node->tn_gen;
 
 	return (0);
 }
 
 static int
 tmpfs_whiteout(struct vop_whiteout_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct tmpfs_dirent *de;
 
 	switch (ap->a_flags) {
 	case LOOKUP:
 		return (0);
 	case CREATE:
 		de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp);
 		if (de != NULL)
 			return (de->td_node == NULL ? 0 : EEXIST);
 		return (tmpfs_dir_whiteout_add(dvp, cnp));
 	case DELETE:
 		tmpfs_dir_whiteout_remove(dvp, cnp);
 		return (0);
 	default:
 		panic("tmpfs_whiteout: unknown op");
 	}
 }
 
 static int
 tmpfs_vptocnp_dir(struct tmpfs_node *tn, struct tmpfs_node *tnp,
     struct tmpfs_dirent **pde)
 {
 	struct tmpfs_dir_cursor dc;
 	struct tmpfs_dirent *de;
 
 	for (de = tmpfs_dir_first(tnp, &dc); de != NULL;
 	     de = tmpfs_dir_next(tnp, &dc)) {
 		if (de->td_node == tn) {
 			*pde = de;
 			return (0);
 		}
 	}
 	return (ENOENT);
 }
 
 static int
 tmpfs_vptocnp_fill(struct vnode *vp, struct tmpfs_node *tn,
     struct tmpfs_node *tnp, char *buf, int *buflen, struct vnode **dvp)
 {
 	struct tmpfs_dirent *de;
 	int error, i;
 
 	error = vn_vget_ino_gen(vp, tmpfs_vn_get_ino_alloc, tnp, LK_SHARED,
 	    dvp);
 	if (error != 0)
 		return (error);
 	error = tmpfs_vptocnp_dir(tn, tnp, &de);
 	if (error == 0) {
 		i = *buflen;
 		i -= de->td_namelen;
 		if (i < 0) {
 			error = ENOMEM;
 		} else {
 			bcopy(de->ud.td_name, buf + i, de->td_namelen);
 			*buflen = i;
 		}
 	}
 	if (error == 0) {
 		if (vp != *dvp)
 			VOP_UNLOCK(*dvp, 0);
 	} else {
 		if (vp != *dvp)
 			vput(*dvp);
 		else
 			vrele(vp);
 	}
 	return (error);
 }
 
 static int
 tmpfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp, **dvp;
 	struct tmpfs_node *tn, *tnp, *tnp1;
 	struct tmpfs_dirent *de;
 	struct tmpfs_mount *tm;
 	char *buf;
 	int *buflen;
 	int error;
 
 	vp = ap->a_vp;
 	dvp = ap->a_vpp;
 	buf = ap->a_buf;
 	buflen = ap->a_buflen;
 
 	tm = VFS_TO_TMPFS(vp->v_mount);
 	tn = VP_TO_TMPFS_NODE(vp);
 	if (tn->tn_type == VDIR) {
 		tnp = tn->tn_dir.tn_parent;
 		if (tnp == NULL)
 			return (ENOENT);
 		tmpfs_ref_node(tnp);
 		error = tmpfs_vptocnp_fill(vp, tn, tn->tn_dir.tn_parent, buf,
 		    buflen, dvp);
 		tmpfs_free_node(tm, tnp);
 		return (error);
 	}
 restart:
 	TMPFS_LOCK(tm);
 	LIST_FOREACH_SAFE(tnp, &tm->tm_nodes_used, tn_entries, tnp1) {
 		if (tnp->tn_type != VDIR)
 			continue;
 		TMPFS_NODE_LOCK(tnp);
 		tmpfs_ref_node_locked(tnp);
 
 		/*
 		 * tn_vnode cannot be instantiated while we hold the
 		 * node lock, so the directory cannot be changed while
 		 * we iterate over it.  Do this to avoid instantiating
 		 * vnode for directories which cannot point to our
 		 * node.
 		 */
 		error = tnp->tn_vnode == NULL ? tmpfs_vptocnp_dir(tn, tnp,
 		    &de) : 0;
 
 		if (error == 0) {
 			TMPFS_NODE_UNLOCK(tnp);
 			TMPFS_UNLOCK(tm);
 			error = tmpfs_vptocnp_fill(vp, tn, tnp, buf, buflen,
 			    dvp);
 			if (error == 0) {
 				tmpfs_free_node(tm, tnp);
 				return (0);
 			}
 			if (VN_IS_DOOMED(vp)) {
 				tmpfs_free_node(tm, tnp);
 				return (ENOENT);
 			}
 			TMPFS_LOCK(tm);
 			TMPFS_NODE_LOCK(tnp);
 		}
 		if (tmpfs_free_node_locked(tm, tnp, false)) {
 			goto restart;
 		} else {
 			KASSERT(tnp->tn_refcount > 0,
 			    ("node %p refcount zero", tnp));
 			tnp1 = LIST_NEXT(tnp, tn_entries);
 			TMPFS_NODE_UNLOCK(tnp);
 		}
 	}
 	TMPFS_UNLOCK(tm);
 	return (ENOENT);
 }
 
 /*
  * Vnode operations vector used for files stored in a tmpfs file system.
  */
 struct vop_vector tmpfs_vnodeop_entries = {
 	.vop_default =			&default_vnodeops,
 	.vop_lookup =			vfs_cache_lookup,
 	.vop_cachedlookup =		tmpfs_cached_lookup,
 	.vop_create =			tmpfs_create,
 	.vop_mknod =			tmpfs_mknod,
 	.vop_open =			tmpfs_open,
 	.vop_close =			tmpfs_close,
 	.vop_access =			tmpfs_access,
 	.vop_getattr =			tmpfs_getattr,
 	.vop_setattr =			tmpfs_setattr,
 	.vop_read =			tmpfs_read,
 	.vop_write =			tmpfs_write,
 	.vop_fsync =			tmpfs_fsync,
 	.vop_remove =			tmpfs_remove,
 	.vop_link =			tmpfs_link,
 	.vop_rename =			tmpfs_rename,
 	.vop_mkdir =			tmpfs_mkdir,
 	.vop_rmdir =			tmpfs_rmdir,
 	.vop_symlink =			tmpfs_symlink,
 	.vop_readdir =			tmpfs_readdir,
 	.vop_readlink =			tmpfs_readlink,
 	.vop_inactive =			tmpfs_inactive,
 	.vop_need_inactive =		tmpfs_need_inactive,
 	.vop_reclaim =			tmpfs_reclaim,
 	.vop_print =			tmpfs_print,
 	.vop_pathconf =			tmpfs_pathconf,
 	.vop_vptofh =			tmpfs_vptofh,
 	.vop_whiteout =			tmpfs_whiteout,
 	.vop_bmap =			VOP_EOPNOTSUPP,
 	.vop_vptocnp =			tmpfs_vptocnp,
 	.vop_lock1 =			vop_lock,
 	.vop_unlock = 			vop_unlock,
 	.vop_islocked = 		vop_islocked,
 };
+VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_entries);
 
 /*
  * Same vector for mounts which do not use namecache.
  */
 struct vop_vector tmpfs_vnodeop_nonc_entries = {
 	.vop_default =			&tmpfs_vnodeop_entries,
 	.vop_lookup =			tmpfs_lookup,
 };
+VFS_VOP_VECTOR_REGISTER(tmpfs_vnodeop_nonc_entries);
Index: head/sys/fs/udf/udf_vnops.c
===================================================================
--- head/sys/fs/udf/udf_vnops.c	(revision 355789)
+++ head/sys/fs/udf/udf_vnops.c	(revision 355790)
@@ -1,1491 +1,1493 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001, 2002 Scott Long <scottl@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /* udf_vnops.c */
 /* Take care of the vnode side of things */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/iconv.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/queue.h>
 #include <sys/unistd.h>
 #include <sys/endian.h>
 
 #include <vm/uma.h>
 
 #include <fs/udf/ecma167-udf.h>
 #include <fs/udf/osta.h>
 #include <fs/udf/udf.h>
 #include <fs/udf/udf_mount.h>
 
 extern struct iconv_functions *udf_iconv;
 
 static vop_access_t	udf_access;
 static vop_getattr_t	udf_getattr;
 static vop_open_t	udf_open;
 static vop_ioctl_t	udf_ioctl;
 static vop_pathconf_t	udf_pathconf;
 static vop_print_t	udf_print;
 static vop_read_t	udf_read;
 static vop_readdir_t	udf_readdir;
 static vop_readlink_t	udf_readlink;
 static vop_setattr_t	udf_setattr;
 static vop_strategy_t	udf_strategy;
 static vop_bmap_t	udf_bmap;
 static vop_cachedlookup_t	udf_lookup;
 static vop_reclaim_t	udf_reclaim;
 static vop_vptofh_t	udf_vptofh;
 static int udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data);
 static int udf_bmap_internal(struct udf_node *node, off_t offset,
     daddr_t *sector, uint32_t *max_size);
 
 static struct vop_vector udf_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		udf_access,
 	.vop_bmap =		udf_bmap,
 	.vop_cachedlookup =	udf_lookup,
 	.vop_getattr =		udf_getattr,
 	.vop_ioctl =		udf_ioctl,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_open =		udf_open,
 	.vop_pathconf =		udf_pathconf,
 	.vop_print =		udf_print,
 	.vop_read =		udf_read,
 	.vop_readdir =		udf_readdir,
 	.vop_readlink =		udf_readlink,
 	.vop_reclaim =		udf_reclaim,
 	.vop_setattr =		udf_setattr,
 	.vop_strategy =		udf_strategy,
 	.vop_vptofh =		udf_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(udf_vnodeops);
 
 struct vop_vector udf_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		udf_access,
 	.vop_getattr =		udf_getattr,
 	.vop_pathconf =		udf_pathconf,
 	.vop_print =		udf_print,
 	.vop_reclaim =		udf_reclaim,
 	.vop_setattr =		udf_setattr,
 	.vop_vptofh =		udf_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(udf_fifoops);
 
 static MALLOC_DEFINE(M_UDFFID, "udf_fid", "UDF FileId structure");
 static MALLOC_DEFINE(M_UDFDS, "udf_ds", "UDF Dirstream structure");
 
 #define UDF_INVALID_BMAP	-1
 
 int
 udf_allocv(struct mount *mp, struct vnode **vpp, struct thread *td)
 {
 	int error;
 	struct vnode *vp;
 
 	error = getnewvnode("udf", mp, &udf_vnodeops, &vp);
 	if (error) {
 		printf("udf_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	*vpp = vp;
 	return (0);
 }
 
 /* Convert file entry permission (5 bits per owner/group/user) to a mode_t */
 static mode_t
 udf_permtomode(struct udf_node *node)
 {
 	uint32_t perm;
 	uint16_t flags;
 	mode_t mode;
 
 	perm = le32toh(node->fentry->perm);
 	flags = le16toh(node->fentry->icbtag.flags);
 
 	mode = perm & UDF_FENTRY_PERM_USER_MASK;
 	mode |= ((perm & UDF_FENTRY_PERM_GRP_MASK) >> 2);
 	mode |= ((perm & UDF_FENTRY_PERM_OWNER_MASK) >> 4);
 	mode |= ((flags & UDF_ICB_TAG_FLAGS_STICKY) << 4);
 	mode |= ((flags & UDF_ICB_TAG_FLAGS_SETGID) << 6);
 	mode |= ((flags & UDF_ICB_TAG_FLAGS_SETUID) << 8);
 
 	return (mode);
 }
 
 static int
 udf_access(struct vop_access_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	accmode_t accmode;
 	mode_t mode;
 
 	vp = a->a_vp;
 	node = VTON(vp);
 	accmode = a->a_accmode;
 
 	if (accmode & VWRITE) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			return (EROFS);
 			/* NOT REACHED */
 		default:
 			break;
 		}
 	}
 
 	mode = udf_permtomode(node);
 
 	return (vaccess(vp->v_type, mode, node->fentry->uid, node->fentry->gid,
 	    accmode, a->a_cred, NULL));
 }
 
 static int
 udf_open(struct vop_open_args *ap) {
 	struct udf_node *np = VTON(ap->a_vp);
 	off_t fsize;
 
 	fsize = le64toh(np->fentry->inf_len);
 	vnode_create_vobject(ap->a_vp, fsize, ap->a_td);
 	return 0;
 }
 
 static const int mon_lens[2][12] = {
 	{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334},
 	{0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335}
 };
 
 static int
 udf_isaleapyear(int year)
 {
 	int i;
 
 	i = (year % 4) ? 0 : 1;
 	i &= (year % 100) ? 1 : 0;
 	i |= (year % 400) ? 0 : 1;
 
 	return i;
 }
 
 /*
  * Timezone calculation compliments of Julian Elischer <julian@elischer.org>.
  */
 static void
 udf_timetotimespec(struct timestamp *time, struct timespec *t)
 {
 	int i, lpyear, daysinyear, year, startyear;
 	union {
 		uint16_t	u_tz_offset;
 		int16_t		s_tz_offset;
 	} tz;
 
 	/*
 	 * DirectCD seems to like using bogus year values.
 	 * Don't trust time->month as it will be used for an array index.
 	 */
 	year = le16toh(time->year);
 	if (year < 1970 || time->month < 1 || time->month > 12) {
 		t->tv_sec = 0;
 		t->tv_nsec = 0;
 		return;
 	}
 
 	/* Calculate the time and day */
 	t->tv_sec = time->second;
 	t->tv_sec += time->minute * 60;
 	t->tv_sec += time->hour * 3600;
 	t->tv_sec += (time->day - 1) * 3600 * 24;
 
 	/* Calculate the month */
 	lpyear = udf_isaleapyear(year);
 	t->tv_sec += mon_lens[lpyear][time->month - 1] * 3600 * 24;
 
 	/* Speed up the calculation */
 	startyear = 1970;
 	if (year > 2009) {
 		t->tv_sec += 1262304000;
 		startyear += 40;
 	} else if (year > 1999) {
 		t->tv_sec += 946684800;
 		startyear += 30;
 	} else if (year > 1989) {
 		t->tv_sec += 631152000;
 		startyear += 20;
 	} else if (year > 1979) {
 		t->tv_sec += 315532800;
 		startyear += 10;
 	}
 
 	daysinyear = (year - startyear) * 365;
 	for (i = startyear; i < year; i++)
 		daysinyear += udf_isaleapyear(i);
 	t->tv_sec += daysinyear * 3600 * 24;
 
 	/* Calculate microseconds */
 	t->tv_nsec = time->centisec * 10000 + time->hund_usec * 100 +
 	    time->usec;
 
 	/*
 	 * Calculate the time zone.  The timezone is 12 bit signed 2's
 	 * complement, so we gotta do some extra magic to handle it right.
 	 */
 	tz.u_tz_offset = le16toh(time->type_tz);
 	tz.u_tz_offset &= 0x0fff;
 	if (tz.u_tz_offset & 0x0800)
 		tz.u_tz_offset |= 0xf000;	/* extend the sign to 16 bits */
 	if ((le16toh(time->type_tz) & 0x1000) && (tz.s_tz_offset != -2047))
 		t->tv_sec -= tz.s_tz_offset * 60;
 
 	return;
 }
 
 static int
 udf_getattr(struct vop_getattr_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *node;
 	struct vattr *vap;
 	struct file_entry *fentry;
 	struct timespec ts;
 
 	ts.tv_sec = 0;
 
 	vp = a->a_vp;
 	vap = a->a_vap;
 	node = VTON(vp);
 	fentry = node->fentry;
 
 	vap->va_fsid = dev2udev(node->udfmp->im_dev);
 	vap->va_fileid = node->hash_id;
 	vap->va_mode = udf_permtomode(node);
 	vap->va_nlink = le16toh(fentry->link_cnt);
 	/*
 	 * XXX The spec says that -1 is valid for uid/gid and indicates an
 	 * invalid uid/gid.  How should this be represented?
 	 */
 	vap->va_uid = (le32toh(fentry->uid) == -1) ? 0 : le32toh(fentry->uid);
 	vap->va_gid = (le32toh(fentry->gid) == -1) ? 0 : le32toh(fentry->gid);
 	udf_timetotimespec(&fentry->atime, &vap->va_atime);
 	udf_timetotimespec(&fentry->mtime, &vap->va_mtime);
 	vap->va_ctime = vap->va_mtime; /* XXX Stored as an Extended Attribute */
 	vap->va_rdev = NODEV;
 	if (vp->v_type & VDIR) {
 		/*
 		 * Directories that are recorded within their ICB will show
 		 * as having 0 blocks recorded.  Since tradition dictates
 		 * that directories consume at least one logical block,
 		 * make it appear so.
 		 */
 		if (fentry->logblks_rec != 0) {
 			vap->va_size =
 			    le64toh(fentry->logblks_rec) * node->udfmp->bsize;
 		} else {
 			vap->va_size = node->udfmp->bsize;
 		}
 	} else {
 		vap->va_size = le64toh(fentry->inf_len);
 	}
 	vap->va_flags = 0;
 	vap->va_gen = 1;
 	vap->va_blocksize = node->udfmp->bsize;
 	vap->va_bytes = le64toh(fentry->inf_len);
 	vap->va_type = vp->v_type;
 	vap->va_filerev = 0; /* XXX */
 	return (0);
 }
 
 static int
 udf_setattr(struct vop_setattr_args *a)
 {
 	struct vnode *vp;
 	struct vattr *vap;
 
 	vp = a->a_vp;
 	vap = a->a_vap;
 	if (vap->va_flags != (u_long)VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL)
 		return (EROFS);
 	if (vap->va_size != (u_quad_t)VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			return (EROFS);
 		case VCHR:
 		case VBLK:
 		case VSOCK:
 		case VFIFO:
 		case VNON:
 		case VBAD:
 		case VMARKER:
 			return (0);
 		}
 	}
 	return (0);
 }
 
 /*
  * File specific ioctls.
  */
 static int
 udf_ioctl(struct vop_ioctl_args *a)
 {
 	printf("%s called\n", __func__);
 	return (ENOTTY);
 }
 
 /*
  * I'm not sure that this has much value in a read-only filesystem, but
  * cd9660 has it too.
  */
 static int
 udf_pathconf(struct vop_pathconf_args *a)
 {
 
 	switch (a->a_name) {
 	case _PC_FILESIZEBITS:
 		*a->a_retval = 64;
 		return (0);
 	case _PC_LINK_MAX:
 		*a->a_retval = 65535;
 		return (0);
 	case _PC_NAME_MAX:
 		*a->a_retval = NAME_MAX;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		*a->a_retval = MAXPATHLEN;
 		return (0);
 	case _PC_NO_TRUNC:
 		*a->a_retval = 1;
 		return (0);
 	case _PC_PIPE_BUF:
 		if (a->a_vp->v_type == VDIR || a->a_vp->v_type == VFIFO) {
 			*a->a_retval = PIPE_BUF;
 			return (0);
 		}
 		return (EINVAL);
 	default:
 		return (vop_stdpathconf(a));
 	}
 }
 
 static int
 udf_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct udf_node *node = VTON(vp);
 
 	printf("    ino %lu, on dev %s", (u_long)node->hash_id,
 	    devtoname(node->udfmp->im_dev));
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 #define lblkno(udfmp, loc)	((loc) >> (udfmp)->bshift)
 #define blkoff(udfmp, loc)	((loc) & (udfmp)->bmask)
 #define lblktosize(udfmp, blk)	((blk) << (udfmp)->bshift)
 
 static inline int
 is_data_in_fentry(const struct udf_node *node)
 {
 	const struct file_entry *fentry = node->fentry;
 
 	return ((le16toh(fentry->icbtag.flags) & 0x7) == 3);
 }
 
 static int
 udf_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct udf_node *node = VTON(vp);
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry;
 	struct buf *bp;
 	uint8_t *data;
 	daddr_t lbn, rablock;
 	off_t diff, fsize;
 	ssize_t n;
 	int error = 0;
 	long size, on;
 
 	if (uio->uio_resid == 0)
 		return (0);
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (is_data_in_fentry(node)) {
 		fentry = node->fentry;
 		data = &fentry->data[le32toh(fentry->l_ea)];
 		fsize = le32toh(fentry->l_ad);
 
 		n = uio->uio_resid;
 		diff = fsize - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		error = uiomove(data + uio->uio_offset, (int)n, uio);
 		return (error);
 	}
 
 	fsize = le64toh(node->fentry->inf_len);
 	udfmp = node->udfmp;
 	do {
 		lbn = lblkno(udfmp, uio->uio_offset);
 		on = blkoff(udfmp, uio->uio_offset);
 		n = min((u_int)(udfmp->bsize - on),
 			uio->uio_resid);
 		diff = fsize - uio->uio_offset;
 		if (diff <= 0)
 			return (0);
 		if (diff < n)
 			n = diff;
 		size = udfmp->bsize;
 		rablock = lbn + 1;
 		if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			if (lblktosize(udfmp, rablock) < fsize) {
 				error = cluster_read(vp, fsize, lbn, size,
 				    NOCRED, uio->uio_resid,
 				    (ap->a_ioflag >> 16), 0, &bp);
 			} else {
 				error = bread(vp, lbn, size, NOCRED, &bp);
 			}
 		} else {
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		}
 		if (error != 0) {
 			brelse(bp);
 			return (error);
 		}
 		n = min(n, size - bp->b_resid);
 
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	return (error);
 }
 
 /*
  * Call the OSTA routines to translate the name from a CS0 dstring to a
  * 16-bit Unicode String.  Hooks need to be placed in here to translate from
  * Unicode to the encoding that the kernel/user expects.  Return the length
  * of the translated string.
  */
 static int
 udf_transname(char *cs0string, char *destname, int len, struct udf_mnt *udfmp)
 {
 	unicode_t *transname;
 	char *unibuf, *unip;
 	int i, destlen;
 	ssize_t unilen = 0;
 	size_t destleft = MAXNAMLEN;
 
 	/* Convert 16-bit Unicode to destname */
 	if (udfmp->im_flags & UDFMNT_KICONV && udf_iconv) {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		unibuf = uma_zalloc(udf_zone_trans, M_WAITOK);
 		unip = unibuf;
 		if ((unilen = (ssize_t)udf_UncompressUnicodeByte(len, cs0string, unibuf)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, unibuf);
 			return 0;
 		}
 
 		while (unilen > 0 && destleft > 0) {
 			udf_iconv->conv(udfmp->im_d2l, __DECONST(const char **,
 			    &unibuf), (size_t *)&unilen, (char **)&destname,
 			    &destleft);
 			/* Unconverted character found */
 			if (unilen > 0 && destleft > 0) {
 				*destname++ = '?';
 				destleft--;
 				unibuf += 2;
 				unilen -= 2;
 			}
 		}
 		uma_zfree(udf_zone_trans, unip);
 		*destname = '\0';
 		destlen = MAXNAMLEN - (int)destleft;
 	} else {
 		/* allocate a buffer big enough to hold an 8->16 bit expansion */
 		transname = uma_zalloc(udf_zone_trans, M_WAITOK);
 
 		if ((unilen = (ssize_t)udf_UncompressUnicode(len, cs0string, transname)) == -1) {
 			printf("udf: Unicode translation failed\n");
 			uma_zfree(udf_zone_trans, transname);
 			return 0;
 		}
 
 		for (i = 0; i < unilen ; i++) {
 			if (transname[i] & 0xff00) {
 				destname[i] = '.';	/* Fudge the 16bit chars */
 			} else {
 				destname[i] = transname[i] & 0xff;
 			}
 		}
 		uma_zfree(udf_zone_trans, transname);
 		destname[unilen] = 0;
 		destlen = (int)unilen;
 	}
 
 	return (destlen);
 }
 
 /*
  * Compare a CS0 dstring with a name passed in from the VFS layer.  Return
  * 0 on a successful match, nonzero otherwise.  Unicode work may need to be done
  * here also.
  */
 static int
 udf_cmpname(char *cs0string, char *cmpname, int cs0len, int cmplen, struct udf_mnt *udfmp)
 {
 	char *transname;
 	int error = 0;
 
 	/* This is overkill, but not worth creating a new zone */
 	transname = uma_zalloc(udf_zone_trans, M_WAITOK);
 
 	cs0len = udf_transname(cs0string, transname, cs0len, udfmp);
 
 	/* Easy check.  If they aren't the same length, they aren't equal */
 	if ((cs0len == 0) || (cs0len != cmplen))
 		error = -1;
 	else
 		error = bcmp(transname, cmpname, cmplen);
 
 	uma_zfree(udf_zone_trans, transname);
 	return (error);
 }
 
 struct udf_uiodir {
 	struct dirent *dirent;
 	u_long *cookies;
 	int ncookies;
 	int acookies;
 	int eofflag;
 };
 
 static int
 udf_uiodir(struct udf_uiodir *uiodir, int de_size, struct uio *uio, long cookie)
 {
 	if (uiodir->cookies != NULL) {
 		if (++uiodir->acookies > uiodir->ncookies) {
 			uiodir->eofflag = 0;
 			return (-1);
 		}
 		*uiodir->cookies++ = cookie;
 	}
 
 	if (uio->uio_resid < de_size) {
 		uiodir->eofflag = 0;
 		return (-1);
 	}
 
 	return (uiomove(uiodir->dirent, de_size, uio));
 }
 
 static struct udf_dirstream *
 udf_opendir(struct udf_node *node, int offset, int fsize, struct udf_mnt *udfmp)
 {
 	struct udf_dirstream *ds;
 
 	ds = uma_zalloc(udf_zone_ds, M_WAITOK | M_ZERO);
 
 	ds->node = node;
 	ds->offset = offset;
 	ds->udfmp = udfmp;
 	ds->fsize = fsize;
 
 	return (ds);
 }
 
 static struct fileid_desc *
 udf_getfid(struct udf_dirstream *ds)
 {
 	struct fileid_desc *fid;
 	int error, frag_size = 0, total_fid_size;
 
 	/* End of directory? */
 	if (ds->offset + ds->off >= ds->fsize) {
 		ds->error = 0;
 		return (NULL);
 	}
 
 	/* Grab the first extent of the directory */
 	if (ds->off == 0) {
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			if (ds->bp != NULL)
 				brelse(ds->bp);
 			return (NULL);
 		}
 	}
 
 	/*
 	 * Clean up from a previous fragmented FID.
 	 * XXX Is this the right place for this?
 	 */
 	if (ds->fid_fragment && ds->buf != NULL) {
 		ds->fid_fragment = 0;
 		free(ds->buf, M_UDFFID);
 	}
 
 	fid = (struct fileid_desc*)&ds->data[ds->off];
 
 	/*
 	 * Check to see if the fid is fragmented. The first test
 	 * ensures that we don't wander off the end of the buffer
 	 * looking for the l_iu and l_fi fields.
 	 */
 	if (ds->off + UDF_FID_SIZE > ds->size ||
 	    ds->off + le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE > ds->size){
 
 		/* Copy what we have of the fid into a buffer */
 		frag_size = ds->size - ds->off;
 		if (frag_size >= ds->udfmp->bsize) {
 			printf("udf: invalid FID fragment\n");
 			ds->error = EINVAL;
 			return (NULL);
 		}
 
 		/*
 		 * File ID descriptors can only be at most one
 		 * logical sector in size.
 		 */
 		ds->buf = malloc(ds->udfmp->bsize, M_UDFFID,
 		     M_WAITOK | M_ZERO);
 		bcopy(fid, ds->buf, frag_size);
 
 		/* Reduce all of the casting magic */
 		fid = (struct fileid_desc*)ds->buf;
 
 		if (ds->bp != NULL)
 			brelse(ds->bp);
 
 		/* Fetch the next allocation */
 		ds->offset += ds->size;
 		ds->size = 0;
 		error = udf_readatoffset(ds->node, &ds->size, ds->offset,
 		    &ds->bp, &ds->data);
 		if (error) {
 			ds->error = error;
 			return (NULL);
 		}
 
 		/*
 		 * If the fragment was so small that we didn't get
 		 * the l_iu and l_fi fields, copy those in.
 		 */
 		if (frag_size < UDF_FID_SIZE)
 			bcopy(ds->data, &ds->buf[frag_size],
 			    UDF_FID_SIZE - frag_size);
 
 		/*
 		 * Now that we have enough of the fid to work with,
 		 * copy in the rest of the fid from the new
 		 * allocation.
 		 */
 		total_fid_size = UDF_FID_SIZE + le16toh(fid->l_iu) + fid->l_fi;
 		if (total_fid_size > ds->udfmp->bsize) {
 			printf("udf: invalid FID\n");
 			ds->error = EIO;
 			return (NULL);
 		}
 		bcopy(ds->data, &ds->buf[frag_size],
 		    total_fid_size - frag_size);
 
 		ds->fid_fragment = 1;
 	} else {
 		total_fid_size = le16toh(fid->l_iu) + fid->l_fi + UDF_FID_SIZE;
 	}
 
 	/*
 	 * Update the offset. Align on a 4 byte boundary because the
 	 * UDF spec says so.
 	 */
 	ds->this_off = ds->offset + ds->off;
 	if (!ds->fid_fragment) {
 		ds->off += (total_fid_size + 3) & ~0x03;
 	} else {
 		ds->off = (total_fid_size - frag_size + 3) & ~0x03;
 	}
 
 	return (fid);
 }
 
 static void
 udf_closedir(struct udf_dirstream *ds)
 {
 
 	if (ds->bp != NULL)
 		brelse(ds->bp);
 
 	if (ds->fid_fragment && ds->buf != NULL)
 		free(ds->buf, M_UDFFID);
 
 	uma_zfree(udf_zone_ds, ds);
 }
 
 static int
 udf_readdir(struct vop_readdir_args *a)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct dirent dir;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid;
 	struct udf_uiodir uiodir;
 	struct udf_dirstream *ds;
 	u_long *cookies = NULL;
 	int ncookies;
 	int error = 0;
 
 	vp = a->a_vp;
 	uio = a->a_uio;
 	node = VTON(vp);
 	udfmp = node->udfmp;
 	uiodir.eofflag = 1;
 
 	if (a->a_ncookies != NULL) {
 		/*
 		 * Guess how many entries are needed.  If we run out, this
 		 * function will be called again and thing will pick up were
 		 * it left off.
 		 */
 		ncookies = uio->uio_resid / 8;
 		cookies = malloc(sizeof(u_long) * ncookies,
 		    M_TEMP, M_WAITOK);
 		if (cookies == NULL)
 			return (ENOMEM);
 		uiodir.ncookies = ncookies;
 		uiodir.cookies = cookies;
 		uiodir.acookies = 0;
 	} else {
 		uiodir.cookies = NULL;
 	}
 
 	/*
 	 * Iterate through the file id descriptors.  Give the parent dir
 	 * entry special attention.
 	 */
 	ds = udf_opendir(node, uio->uio_offset, le64toh(node->fentry->inf_len),
 	    node->udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("Invalid FID tag\n");
 			hexdump(fid, UDF_FID_SIZE, NULL, 0);
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			/* Do up the '.' and '..' entries.  Dummy values are
 			 * used for the cookies since the offset here is
 			 * usually zero, and NFS doesn't like that value
 			 */
 			dir.d_fileno = node->hash_id;
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_namlen = 1;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			dir.d_off = 1;
 			dirent_terminate(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 1);
 			if (error)
 				break;
 
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = DT_DIR;
 			dir.d_name[0] = '.';
 			dir.d_name[1] = '.';
 			dir.d_namlen = 2;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			dir.d_off = 2;
 			dirent_terminate(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio, 2);
 		} else {
 			dir.d_namlen = udf_transname(&fid->data[fid->l_iu],
 			    &dir.d_name[0], fid->l_fi, udfmp);
 			dir.d_fileno = udf_getid(&fid->icb);
 			dir.d_type = (fid->file_char & UDF_FILE_CHAR_DIR) ?
 			    DT_DIR : DT_UNKNOWN;
 			dir.d_reclen = GENERIC_DIRSIZ(&dir);
 			dir.d_off = ds->this_off;
 			dirent_terminate(&dir);
 			uiodir.dirent = &dir;
 			error = udf_uiodir(&uiodir, dir.d_reclen, uio,
 			    ds->this_off);
 		}
 		if (error)
 			break;
 		uio->uio_offset = ds->offset + ds->off;
 	}
 
 	/* tell the calling layer whether we need to be called again */
 	*a->a_eofflag = uiodir.eofflag;
 
 	if (error < 0)
 		error = 0;
 	if (!error)
 		error = ds->error;
 
 	udf_closedir(ds);
 
 	if (a->a_ncookies != NULL) {
 		if (error)
 			free(cookies, M_TEMP);
 		else {
 			*a->a_ncookies = uiodir.acookies;
 			*a->a_cookies = cookies;
 		}
 	}
 
 	return (error);
 }
 
 static int
 udf_readlink(struct vop_readlink_args *ap)
 {
 	struct path_component *pc, *end;
 	struct vnode *vp;
 	struct uio uio;
 	struct iovec iov[1];
 	struct udf_node *node;
 	void *buf;
 	char *cp;
 	int error, len, root;
 
 	/*
 	 * A symbolic link in UDF is a list of variable-length path
 	 * component structures.  We build a pathname in the caller's
 	 * uio by traversing this list.
 	 */
 	vp = ap->a_vp;
 	node = VTON(vp);
 	len = le64toh(node->fentry->inf_len);
 	buf = malloc(len, M_DEVBUF, M_WAITOK);
 	iov[0].iov_len = len;
 	iov[0].iov_base = buf;
 	uio.uio_iov = iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = iov[0].iov_len;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 	error = VOP_READ(vp, &uio, 0, ap->a_cred);
 	if (error)
 		goto error;
 
 	pc = buf;
 	end = (void *)((char *)buf + len);
 	root = 0;
 	while (pc < end) {
 		switch (pc->type) {
 		case UDF_PATH_ROOT:
 			/* Only allow this at the beginning of a path. */
 			if ((void *)pc != buf) {
 				error = EINVAL;
 				goto error;
 			}
 			cp = "/";
 			len = 1;
 			root = 1;
 			break;
 		case UDF_PATH_DOT:
 			cp = ".";
 			len = 1;
 			break;
 		case UDF_PATH_DOTDOT:
 			cp = "..";
 			len = 2;
 			break;
 		case UDF_PATH_PATH:
 			if (pc->length == 0) {
 				error = EINVAL;
 				goto error;
 			}
 			/*
 			 * XXX: We only support CS8 which appears to map
 			 * to ASCII directly.
 			 */
 			switch (pc->identifier[0]) {
 			case 8:
 				cp = pc->identifier + 1;
 				len = pc->length - 1;
 				break;
 			default:
 				error = EOPNOTSUPP;
 				goto error;
 			}
 			break;
 		default:
 			error = EINVAL;
 			goto error;
 		}
 
 		/*
 		 * If this is not the first component, insert a path
 		 * separator.
 		 */
 		if (pc != buf) {
 			/* If we started with root we already have a "/". */
 			if (root)
 				goto skipslash;
 			root = 0;
 			if (ap->a_uio->uio_resid < 1) {
 				error = ENAMETOOLONG;
 				goto error;
 			}
 			error = uiomove("/", 1, ap->a_uio);
 			if (error)
 				break;
 		}
 	skipslash:
 
 		/* Append string at 'cp' of length 'len' to our path. */
 		if (len > ap->a_uio->uio_resid) {
 			error = ENAMETOOLONG;
 			goto error;
 		}
 		error = uiomove(cp, len, ap->a_uio);
 		if (error)
 			break;
 
 		/* Advance to next component. */
 		pc = (void *)((char *)pc + 4 + pc->length);
 	}
 error:
 	free(buf, M_DEVBUF);
 	return (error);
 }
 
 static int
 udf_strategy(struct vop_strategy_args *a)
 {
 	struct buf *bp;
 	struct vnode *vp;
 	struct udf_node *node;
 	struct bufobj *bo;
 	off_t offset;
 	uint32_t maxsize;
 	daddr_t sector;
 	int error;
 
 	bp = a->a_bp;
 	vp = a->a_vp;
 	node = VTON(vp);
 
 	if (bp->b_blkno == bp->b_lblkno) {
 		offset = lblktosize(node->udfmp, bp->b_lblkno);
 		error = udf_bmap_internal(node, offset, &sector, &maxsize);
 		if (error) {
 			clrbuf(bp);
 			bp->b_blkno = -1;
 			bufdone(bp);
 			return (0);
 		}
 		/* bmap gives sector numbers, bio works with device blocks */
 		bp->b_blkno = sector << (node->udfmp->bshift - DEV_BSHIFT);
 	}
 	bo = node->udfmp->im_bo;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 static int
 udf_bmap(struct vop_bmap_args *a)
 {
 	struct udf_node *node;
 	uint32_t max_size;
 	daddr_t lsector;
 	int nblk;
 	int error;
 
 	node = VTON(a->a_vp);
 
 	if (a->a_bop != NULL)
 		*a->a_bop = &node->udfmp->im_devvp->v_bufobj;
 	if (a->a_bnp == NULL)
 		return (0);
 	if (a->a_runb)
 		*a->a_runb = 0;
 
 	/*
 	 * UDF_INVALID_BMAP means data embedded into fentry, this is an internal
 	 * error that should not be propagated to calling code.
 	 * Most obvious mapping for this error is EOPNOTSUPP as we can not truly
 	 * translate block numbers in this case.
 	 * Incidentally, this return code will make vnode pager to use VOP_READ
 	 * to get data for mmap-ed pages and udf_read knows how to do the right
 	 * thing for this kind of files.
 	 */
 	error = udf_bmap_internal(node, a->a_bn << node->udfmp->bshift,
 	    &lsector, &max_size);
 	if (error == UDF_INVALID_BMAP)
 		return (EOPNOTSUPP);
 	if (error)
 		return (error);
 
 	/* Translate logical to physical sector number */
 	*a->a_bnp = lsector << (node->udfmp->bshift - DEV_BSHIFT);
 
 	/*
 	 * Determine maximum number of readahead blocks following the
 	 * requested block.
 	 */
 	if (a->a_runp) {
 		nblk = (max_size >> node->udfmp->bshift) - 1;
 		if (nblk <= 0)
 			*a->a_runp = 0;
 		else if (nblk >= (MAXBSIZE >> node->udfmp->bshift))
 			*a->a_runp = (MAXBSIZE >> node->udfmp->bshift) - 1;
 		else
 			*a->a_runp = nblk;
 	}
 
 	if (a->a_runb) {
 		*a->a_runb = 0;
 	}
 
 	return (0);
 }
 
 /*
  * The all powerful VOP_LOOKUP().
  */
 static int
 udf_lookup(struct vop_cachedlookup_args *a)
 {
 	struct vnode *dvp;
 	struct vnode *tdp = NULL;
 	struct vnode **vpp = a->a_vpp;
 	struct udf_node *node;
 	struct udf_mnt *udfmp;
 	struct fileid_desc *fid = NULL;
 	struct udf_dirstream *ds;
 	u_long nameiop;
 	u_long flags;
 	char *nameptr;
 	long namelen;
 	ino_t id = 0;
 	int offset, error = 0;
 	int fsize, lkflags, ltype, numdirpasses;
 
 	dvp = a->a_dvp;
 	node = VTON(dvp);
 	udfmp = node->udfmp;
 	nameiop = a->a_cnp->cn_nameiop;
 	flags = a->a_cnp->cn_flags;
 	lkflags = a->a_cnp->cn_lkflags;
 	nameptr = a->a_cnp->cn_nameptr;
 	namelen = a->a_cnp->cn_namelen;
 	fsize = le64toh(node->fentry->inf_len);
 
 	/*
 	 * If this is a LOOKUP and we've already partially searched through
 	 * the directory, pick up where we left off and flag that the
 	 * directory may need to be searched twice.  For a full description,
 	 * see /sys/fs/cd9660/cd9660_lookup.c:cd9660_lookup()
 	 */
 	if (nameiop != LOOKUP || node->diroff == 0 || node->diroff > fsize) {
 		offset = 0;
 		numdirpasses = 1;
 	} else {
 		offset = node->diroff;
 		numdirpasses = 2;
 		nchstats.ncs_2passes++;
 	}
 
 lookloop:
 	ds = udf_opendir(node, offset, fsize, udfmp);
 
 	while ((fid = udf_getfid(ds)) != NULL) {
 
 		/* XXX Should we return an error on a bad fid? */
 		if (udf_checktag(&fid->tag, TAGID_FID)) {
 			printf("udf_lookup: Invalid tag\n");
 			error = EIO;
 			break;
 		}
 
 		/* Is this a deleted file? */
 		if (fid->file_char & UDF_FILE_CHAR_DEL)
 			continue;
 
 		if ((fid->l_fi == 0) && (fid->file_char & UDF_FILE_CHAR_PAR)) {
 			if (flags & ISDOTDOT) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		} else {
 			if (!(udf_cmpname(&fid->data[fid->l_iu],
 			    nameptr, fid->l_fi, namelen, udfmp))) {
 				id = udf_getid(&fid->icb);
 				break;
 			}
 		}
 	}
 
 	if (!error)
 		error = ds->error;
 
 	/* XXX Bail out here? */
 	if (error) {
 		udf_closedir(ds);
 		return (error);
 	}
 
 	/* Did we have a match? */
 	if (id) {
 		/*
 		 * Remember where this entry was if it's the final
 		 * component.
 		 */
 		if ((flags & ISLASTCN) && nameiop == LOOKUP)
 			node->diroff = ds->offset + ds->off;
 		if (numdirpasses == 2)
 			nchstats.ncs_pass2++;
 		udf_closedir(ds);
 
 		if (flags & ISDOTDOT) {
 			error = vn_vget_ino(dvp, id, lkflags, &tdp);
 		} else if (node->hash_id == id) {
 			VREF(dvp);	/* we want ourself, ie "." */
 			/*
 			 * When we lookup "." we still can be asked to lock it
 			 * differently.
 			 */
 			ltype = lkflags & LK_TYPE_MASK;
 			if (ltype != VOP_ISLOCKED(dvp)) {
 				if (ltype == LK_EXCLUSIVE)
 					vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 				else /* if (ltype == LK_SHARED) */
 					vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 			}
 			tdp = dvp;
 		} else
 			error = udf_vget(udfmp->im_mountp, id, lkflags, &tdp);
 		if (!error) {
 			*vpp = tdp;
 			/* Put this entry in the cache */
 			if (flags & MAKEENTRY)
 				cache_enter(dvp, *vpp, a->a_cnp);
 		}
 	} else {
 		/* Name wasn't found on this pass.  Do another pass? */
 		if (numdirpasses == 2) {
 			numdirpasses--;
 			offset = 0;
 			udf_closedir(ds);
 			goto lookloop;
 		}
 		udf_closedir(ds);
 
 		/* Enter name into cache as non-existant */
 		if (flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, a->a_cnp);
 
 		if ((flags & ISLASTCN) &&
 		    (nameiop == CREATE || nameiop == RENAME)) {
 			error = EROFS;
 		} else {
 			error = ENOENT;
 		}
 	}
 
 	return (error);
 }
 
 static int
 udf_reclaim(struct vop_reclaim_args *a)
 {
 	struct vnode *vp;
 	struct udf_node *unode;
 
 	vp = a->a_vp;
 	unode = VTON(vp);
 
 	if (unode != NULL) {
 		vfs_hash_remove(vp);
 
 		if (unode->fentry != NULL)
 			free(unode->fentry, M_UDFFENTRY);
 		uma_zfree(udf_zone_node, unode);
 		vp->v_data = NULL;
 	}
 
 	return (0);
 }
 
 static int
 udf_vptofh(struct vop_vptofh_args *a)
 {
 	struct udf_node *node;
 	struct ifid *ifhp;
 
 	node = VTON(a->a_vp);
 	ifhp = (struct ifid *)a->a_fhp;
 	ifhp->ifid_len = sizeof(struct ifid);
 	ifhp->ifid_ino = node->hash_id;
 
 	return (0);
 }
 
 /*
  * Read the block and then set the data pointer to correspond with the
  * offset passed in.  Only read in at most 'size' bytes, and then set 'size'
  * to the number of bytes pointed to.  If 'size' is zero, try to read in a
  * whole extent.
  *
  * Note that *bp may be assigned error or not.
  *
  */
 static int
 udf_readatoffset(struct udf_node *node, int *size, off_t offset,
     struct buf **bp, uint8_t **data)
 {
 	struct udf_mnt *udfmp = node->udfmp;
 	struct vnode *vp = node->i_vnode;
 	struct file_entry *fentry;
 	struct buf *bp1;
 	uint32_t max_size;
 	daddr_t sector;
 	off_t off;
 	int adj_size;
 	int error;
 
 	/*
 	 * This call is made *not* only to detect UDF_INVALID_BMAP case,
 	 * max_size is used as an ad-hoc read-ahead hint for "normal" case.
 	 */
 	error = udf_bmap_internal(node, offset, &sector, &max_size);
 	if (error == UDF_INVALID_BMAP) {
 		/*
 		 * This error means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 */
 		fentry = node->fentry;
 		*data = &fentry->data[le32toh(fentry->l_ea)];
 		*size = le32toh(fentry->l_ad);
 		if (offset >= *size)
 			*size = 0;
 		else {
 			*data += offset;
 			*size -= offset;
 		}
 		return (0);
 	} else if (error != 0) {
 		return (error);
 	}
 
 	/* Adjust the size so that it is within range */
 	if (*size == 0 || *size > max_size)
 		*size = max_size;
 
 	/*
 	 * Because we will read starting at block boundary, we need to adjust
 	 * how much we need to read so that all promised data is in.
 	 * Also, we can't promise to read more than MAXBSIZE bytes starting
 	 * from block boundary, so adjust what we promise too.
 	 */
 	off = blkoff(udfmp, offset);
 	*size = min(*size, MAXBSIZE - off);
 	adj_size = (*size + off + udfmp->bmask) & ~udfmp->bmask;
 	*bp = NULL;
 	if ((error = bread(vp, lblkno(udfmp, offset), adj_size, NOCRED, bp))) {
 		printf("warning: udf_readlblks returned error %d\n", error);
 		/* note: *bp may be non-NULL */
 		return (error);
 	}
 
 	bp1 = *bp;
 	*data = (uint8_t *)&bp1->b_data[offset & udfmp->bmask];
 	return (0);
 }
 
 /*
  * Translate a file offset into a logical block and then into a physical
  * block.
  * max_size - maximum number of bytes that can be read starting from given
  * offset, rather than beginning of calculated sector number
  */
 static int
 udf_bmap_internal(struct udf_node *node, off_t offset, daddr_t *sector,
     uint32_t *max_size)
 {
 	struct udf_mnt *udfmp;
 	struct file_entry *fentry;
 	void *icb;
 	struct icb_tag *tag;
 	uint32_t icblen = 0;
 	daddr_t lsector;
 	int ad_offset, ad_num = 0;
 	int i, p_offset;
 
 	udfmp = node->udfmp;
 	fentry = node->fentry;
 	tag = &fentry->icbtag;
 
 	switch (le16toh(tag->strat_type)) {
 	case 4:
 		break;
 
 	case 4096:
 		printf("Cannot deal with strategy4096 yet!\n");
 		return (ENODEV);
 
 	default:
 		printf("Unknown strategy type %d\n", tag->strat_type);
 		return (ENODEV);
 	}
 
 	switch (le16toh(tag->flags) & 0x7) {
 	case 0:
 		/*
 		 * The allocation descriptor field is filled with short_ad's.
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct short_ad) * ad_num;
 			if (ad_offset > le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(short_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(short_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset  >> udfmp->bshift) +
 		    le32toh(((struct short_ad *)(icb))->pos);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 1:
 		/*
 		 * The allocation descriptor field is filled with long_ad's
 		 * If the offset is beyond the current extent, look for the
 		 * next extent.
 		 */
 		do {
 			offset -= icblen;
 			ad_offset = sizeof(struct long_ad) * ad_num;
 			if (ad_offset > le32toh(fentry->l_ad)) {
 				printf("File offset out of bounds\n");
 				return (EINVAL);
 			}
 			icb = GETICB(long_ad, fentry,
 			    le32toh(fentry->l_ea) + ad_offset);
 			icblen = GETICBLEN(long_ad, icb);
 			ad_num++;
 		} while(offset >= icblen);
 
 		lsector = (offset >> udfmp->bshift) +
 		    le32toh(((struct long_ad *)(icb))->loc.lb_num);
 
 		*max_size = icblen - offset;
 
 		break;
 	case 3:
 		/*
 		 * This type means that the file *data* is stored in the
 		 * allocation descriptor field of the file entry.
 		 */
 		*max_size = 0;
 		*sector = node->hash_id + udfmp->part_start;
 
 		return (UDF_INVALID_BMAP);
 	case 2:
 		/* DirectCD does not use extended_ad's */
 	default:
 		printf("Unsupported allocation descriptor %d\n",
 		       tag->flags & 0x7);
 		return (ENODEV);
 	}
 
 	*sector = lsector + udfmp->part_start;
 
 	/*
 	 * Check the sparing table.  Each entry represents the beginning of
 	 * a packet.
 	 */
 	if (udfmp->s_table != NULL) {
 		for (i = 0; i< udfmp->s_table_entries; i++) {
 			p_offset =
 			    lsector - le32toh(udfmp->s_table->entries[i].org);
 			if ((p_offset < udfmp->p_sectors) && (p_offset >= 0)) {
 				*sector =
 				   le32toh(udfmp->s_table->entries[i].map) +
 				    p_offset;
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
Index: head/sys/fs/unionfs/union_vnops.c
===================================================================
--- head/sys/fs/unionfs/union_vnops.c	(revision 355789)
+++ head/sys/fs/unionfs/union_vnops.c	(revision 355790)
@@ -1,2558 +1,2559 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry.
  * Copyright (c) 1992, 1993, 1994, 1995
  *      The Regents of the University of California.
  * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
  * Copyright (c) 2006, 2012 Daichi Goto <daichi@freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)union_vnops.c	8.32 (Berkeley) 6/23/95
  * $FreeBSD$
  *
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/kdb.h>
 #include <sys/fcntl.h>
 #include <sys/stat.h>
 #include <sys/dirent.h>
 #include <sys/proc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 
 #include <fs/unionfs/union.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vnode_pager.h>
 
 #if 0
 #define UNIONFS_INTERNAL_DEBUG(msg, args...)    printf(msg, ## args)
 #define UNIONFS_IDBG_RENAME
 #else
 #define UNIONFS_INTERNAL_DEBUG(msg, args...)
 #endif
 
 #define KASSERT_UNIONFS_VNODE(vp) \
 	KASSERT(((vp)->v_op == &unionfs_vnodeops), \
 	    ("unionfs: it is not unionfs-vnode"))
 
 static int
 unionfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	int		iswhiteout;
 	int		lockflag;
 	int		error , uerror, lerror;
 	u_long		nameiop;
 	u_long		cnflags, cnflagsbk;
 	struct unionfs_node *dunp;
 	struct vnode   *dvp, *udvp, *ldvp, *vp, *uvp, *lvp, *dtmpvp;
 	struct vattr	va;
 	struct componentname *cnp;
 	struct thread  *td;
 
 	iswhiteout = 0;
 	lockflag = 0;
 	error = uerror = lerror = ENOENT;
 	cnp = ap->a_cnp;
 	nameiop = cnp->cn_nameiop;
 	cnflags = cnp->cn_flags;
 	dvp = ap->a_dvp;
 	dunp = VTOUNIONFS(dvp);
 	udvp = dunp->un_uppervp;
 	ldvp = dunp->un_lowervp;
 	vp = uvp = lvp = NULLVP;
 	td = curthread;
 	*(ap->a_vpp) = NULLVP;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_lookup: enter: nameiop=%ld, flags=%lx, path=%s\n", nameiop, cnflags, cnp->cn_nameptr);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/*
 	 * If read-only and op is not LOOKUP, will return EROFS.
 	 */
 	if ((cnflags & ISLASTCN) &&
 	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    LOOKUP != nameiop)
 		return (EROFS);
 
 	/*
 	 * lookup dotdot
 	 */
 	if (cnflags & ISDOTDOT) {
 		if (LOOKUP != nameiop && udvp == NULLVP)
 			return (EROFS);
 
 		if (udvp != NULLVP) {
 			dtmpvp = udvp;
 			if (ldvp != NULLVP)
 				VOP_UNLOCK(ldvp, LK_RELEASE);
 		}
 		else
 			dtmpvp = ldvp;
 
 		error = VOP_LOOKUP(dtmpvp, &vp, cnp);
 
 		if (dtmpvp == udvp && ldvp != NULLVP) {
 			VOP_UNLOCK(udvp, LK_RELEASE);
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		}
 
 		if (error == 0) {
 			/*
 			 * Exchange lock and reference from vp to
 			 * dunp->un_dvp. vp is upper/lower vnode, but it
 			 * will need to return the unionfs vnode.
 			 */
 			if (nameiop == DELETE  || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
 				VOP_UNLOCK(vp, LK_RELEASE);
 			vrele(vp);
 
 			VOP_UNLOCK(dvp, LK_RELEASE);
 			*(ap->a_vpp) = dunp->un_dvp;
 			vref(dunp->un_dvp);
 
 			if (nameiop == DELETE || nameiop == RENAME)
 				vn_lock(dunp->un_dvp, LK_EXCLUSIVE | LK_RETRY);
 			else if (cnp->cn_lkflags & LK_TYPE_MASK)
 				vn_lock(dunp->un_dvp, cnp->cn_lkflags |
 				    LK_RETRY);
 
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		} else if (error == ENOENT && (cnflags & MAKEENTRY) != 0)
 			cache_enter(dvp, NULLVP, cnp);
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error);
 
 		return (error);
 	}
 
 	/*
 	 * lookup upper layer
 	 */
 	if (udvp != NULLVP) {
 		uerror = VOP_LOOKUP(udvp, &uvp, cnp);
 
 		if (uerror == 0) {
 			if (udvp == uvp) {	/* is dot */
 				vrele(uvp);
 				*(ap->a_vpp) = dvp;
 				vref(dvp);
 
 				UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", uerror);
 
 				return (uerror);
 			}
 			if (nameiop == DELETE || nameiop == RENAME ||
 			    (cnp->cn_lkflags & LK_TYPE_MASK))
 				VOP_UNLOCK(uvp, LK_RELEASE);
 		}
 
 		/* check whiteout */
 		if (uerror == ENOENT || uerror == EJUSTRETURN)
 			if (cnp->cn_flags & ISWHITEOUT)
 				iswhiteout = 1;	/* don't lookup lower */
 		if (iswhiteout == 0 && ldvp != NULLVP)
 			if (!VOP_GETATTR(udvp, &va, cnp->cn_cred) &&
 			    (va.va_flags & OPAQUE))
 				iswhiteout = 1;	/* don't lookup lower */
 #if 0
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: debug: whiteout=%d, path=%s\n", iswhiteout, cnp->cn_nameptr);
 #endif
 	}
 
 	/*
 	 * lookup lower layer
 	 */
 	if (ldvp != NULLVP && !(cnflags & DOWHITEOUT) && iswhiteout == 0) {
 		/* always op is LOOKUP */
 		cnp->cn_nameiop = LOOKUP;
 		cnflagsbk = cnp->cn_flags;
 		cnp->cn_flags = cnflags;
 
 		lerror = VOP_LOOKUP(ldvp, &lvp, cnp);
 
 		cnp->cn_nameiop = nameiop;
 		if (udvp != NULLVP && (uerror == 0 || uerror == EJUSTRETURN))
 			cnp->cn_flags = cnflagsbk;
 
 		if (lerror == 0) {
 			if (ldvp == lvp) {	/* is dot */
 				if (uvp != NULLVP)
 					vrele(uvp);	/* no need? */
 				vrele(lvp);
 				*(ap->a_vpp) = dvp;
 				vref(dvp);
 
 				UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", lerror);
 
 				return (lerror);
 			}
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
 				VOP_UNLOCK(lvp, LK_RELEASE);
 		}
 	}
 
 	/*
 	 * check lookup result
 	 */
 	if (uvp == NULLVP && lvp == NULLVP) {
 		UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n",
 		    (udvp != NULLVP ? uerror : lerror));
 		return (udvp != NULLVP ? uerror : lerror);
 	}
 
 	/*
 	 * check vnode type
 	 */
 	if (uvp != NULLVP && lvp != NULLVP && uvp->v_type != lvp->v_type) {
 		vrele(lvp);
 		lvp = NULLVP;
 	}
 
 	/*
 	 * check shadow dir
 	 */
 	if (uerror != 0 && uerror != EJUSTRETURN && udvp != NULLVP &&
 	    lerror == 0 && lvp != NULLVP && lvp->v_type == VDIR &&
 	    !(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (1 < cnp->cn_namelen || '.' != *(cnp->cn_nameptr))) {
 		/* get unionfs vnode in order to create a new shadow dir. */
 		error = unionfs_nodeget(dvp->v_mount, NULLVP, lvp, dvp, &vp,
 		    cnp, td);
 		if (error != 0)
 			goto unionfs_lookup_out;
 
 		if (LK_SHARED == (cnp->cn_lkflags & LK_TYPE_MASK))
 			VOP_UNLOCK(vp, LK_RELEASE);
 		if (LK_EXCLUSIVE != VOP_ISLOCKED(vp)) {
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 			lockflag = 1;
 		}
 		error = unionfs_mkshadowdir(MOUNTTOUNIONFSMOUNT(dvp->v_mount),
 		    udvp, VTOUNIONFS(vp), cnp, td);
 		if (lockflag != 0)
 			VOP_UNLOCK(vp, LK_RELEASE);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create shadow dir.");
 			if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE)
 				vput(vp);
 			else
 				vrele(vp);
 			goto unionfs_lookup_out;
 		}
 		if ((cnp->cn_lkflags & LK_TYPE_MASK) == LK_SHARED)
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 	}
 	/*
 	 * get unionfs vnode.
 	 */
 	else {
 		if (uvp != NULLVP)
 			error = uerror;
 		else
 			error = lerror;
 		if (error != 0)
 			goto unionfs_lookup_out;
 		/*
 		 * get socket vnode.
 		 */
 		if (uvp != NULLVP && uvp->v_type == VSOCK) {
 			vp = uvp;
 			vref(vp);
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
 				vn_lock(vp, cnp->cn_lkflags | LK_RETRY);
 		}
 		else if (lvp != NULLVP && lvp->v_type == VSOCK) {
 			vp = lvp;
 			vref(vp);
 			if (cnp->cn_lkflags & LK_TYPE_MASK)
 				vn_lock(vp, cnp->cn_lkflags | LK_RETRY);
 		}
 		/*
 		 * get unionfs vnode.
 		 */
 		else
 			error = unionfs_nodeget(dvp->v_mount, uvp, lvp,
 			    dvp, &vp, cnp, td);
 		if (error != 0) {
 			UNIONFSDEBUG("unionfs_lookup: Unable to create unionfs vnode.");
 			goto unionfs_lookup_out;
 		}
 		if ((nameiop == DELETE || nameiop == RENAME) &&
 		    (cnp->cn_lkflags & LK_TYPE_MASK) == 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	*(ap->a_vpp) = vp;
 
 	if ((cnflags & MAKEENTRY) && vp->v_type != VSOCK)
 		cache_enter(dvp, vp, cnp);
 
 unionfs_lookup_out:
 	if (uvp != NULLVP)
 		vrele(uvp);
 	if (lvp != NULLVP)
 		vrele(lvp);
 
 	if (error == ENOENT && (cnflags & MAKEENTRY) != 0)
 		cache_enter(dvp, NULLVP, cnp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_lookup: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_create(struct vop_create_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct vnode   *udvp;
 	struct vnode   *vp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_create: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	udvp = dunp->un_uppervp;
 	error = EROFS;
 
 	if (udvp != NULLVP) {
 		error = VOP_CREATE(udvp, &vp, cnp, ap->a_vap);
 		if (error != 0)
 			goto unionfs_create_abort;
 
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
 			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
 		}
 	}
 
 unionfs_create_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_create: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_whiteout(struct vop_whiteout_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct vnode   *udvp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	udvp = dunp->un_uppervp;
 	error = EOPNOTSUPP;
 
 	if (udvp != NULLVP) {
 		switch (ap->a_flags) {
 		case CREATE:
 		case DELETE:
 		case LOOKUP:
 			error = VOP_WHITEOUT(udvp, cnp, ap->a_flags);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_whiteout: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_mknod(struct vop_mknod_args *ap)
 {
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct vnode   *udvp;
 	struct vnode   *vp;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mknod: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	udvp = dunp->un_uppervp;
 	error = EROFS;
 
 	if (udvp != NULLVP) {
 		error = VOP_MKNOD(udvp, &vp, cnp, ap->a_vap);
 		if (error != 0)
 			goto unionfs_mknod_abort;
 
 		if (vp->v_type == VSOCK)
 			*(ap->a_vpp) = vp;
 		else {
 			VOP_UNLOCK(vp, LK_RELEASE);
 			error = unionfs_nodeget(ap->a_dvp->v_mount, vp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, curthread);
 			vrele(vp);
 		}
 	}
 
 unionfs_mknod_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_mknod: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_open(struct vop_open_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *targetvp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_open: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	targetvp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	unionfs_get_node_status(unp, td, &unsp);
 
 	if (unsp->uns_lower_opencnt > 0 || unsp->uns_upper_opencnt > 0) {
 		/* vnode is already opend. */
 		if (unsp->uns_upper_opencnt > 0)
 			targetvp = uvp;
 		else
 			targetvp = lvp;
 
 		if (targetvp == lvp &&
 		    (ap->a_mode & FWRITE) && lvp->v_type == VREG)
 			targetvp = NULLVP;
 	}
 	if (targetvp == NULLVP) {
 		if (uvp == NULLVP) {
 			if ((ap->a_mode & FWRITE) && lvp->v_type == VREG) {
 				error = unionfs_copyfile(unp,
 				    !(ap->a_mode & O_TRUNC), cred, td);
 				if (error != 0)
 					goto unionfs_open_abort;
 				targetvp = uvp = unp->un_uppervp;
 			} else
 				targetvp = lvp;
 		} else
 			targetvp = uvp;
 	}
 
 	error = VOP_OPEN(targetvp, ap->a_mode, cred, td, ap->a_fp);
 	if (error == 0) {
 		if (targetvp == uvp) {
 			if (uvp->v_type == VDIR && lvp != NULLVP &&
 			    unsp->uns_lower_opencnt <= 0) {
 				/* open lower for readdir */
 				error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
 				if (error != 0) {
 					VOP_CLOSE(uvp, ap->a_mode, cred, td);
 					goto unionfs_open_abort;
 				}
 				unsp->uns_node_flag |= UNS_OPENL_4_READDIR;
 				unsp->uns_lower_opencnt++;
 			}
 			unsp->uns_upper_opencnt++;
 		} else {
 			unsp->uns_lower_opencnt++;
 			unsp->uns_lower_openmode = ap->a_mode;
 		}
 		ap->a_vp->v_object = targetvp->v_object;
 	}
 
 unionfs_open_abort:
 	if (error != 0)
 		unionfs_tryrem_node_status(unp, unsp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_open: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_close(struct vop_close_args *ap)
 {
 	int		error;
 	int		locked;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct ucred   *cred;
 	struct thread  *td;
 	struct vnode   *vp;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	locked = 0;
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 		if (vn_lock(vp, LK_UPGRADE) != 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		locked = 1;
 	}
 	unionfs_get_node_status(unp, td, &unsp);
 
 	if (unsp->uns_lower_opencnt <= 0 && unsp->uns_upper_opencnt <= 0) {
 #ifdef DIAGNOSTIC
 		printf("unionfs_close: warning: open count is 0\n");
 #endif
 		if (unp->un_uppervp != NULLVP)
 			ovp = unp->un_uppervp;
 		else
 			ovp = unp->un_lowervp;
 	} else if (unsp->uns_upper_opencnt > 0)
 		ovp = unp->un_uppervp;
 	else
 		ovp = unp->un_lowervp;
 
 	error = VOP_CLOSE(ovp, ap->a_fflag, cred, td);
 
 	if (error != 0)
 		goto unionfs_close_abort;
 
 	vp->v_object = ovp->v_object;
 
 	if (ovp == unp->un_uppervp) {
 		unsp->uns_upper_opencnt--;
 		if (unsp->uns_upper_opencnt == 0) {
 			if (unsp->uns_node_flag & UNS_OPENL_4_READDIR) {
 				VOP_CLOSE(unp->un_lowervp, FREAD, cred, td);
 				unsp->uns_node_flag &= ~UNS_OPENL_4_READDIR;
 				unsp->uns_lower_opencnt--;
 			}
 			if (unsp->uns_lower_opencnt > 0)
 				vp->v_object = unp->un_lowervp->v_object;
 		}
 	} else
 		unsp->uns_lower_opencnt--;
 
 unionfs_close_abort:
 	unionfs_tryrem_node_status(unp, unsp);
 
 	if (locked != 0)
 		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_close: leave (%d)\n", error);
 
 	return (error);
 }
 
 /*
  * Check the access mode toward shadow file/dir.
  */
 static int
 unionfs_check_corrected_access(accmode_t accmode,
 			     struct vattr *va,
 			     struct ucred *cred)
 {
 	int		count;
 	uid_t		uid;	/* upper side vnode's uid */
 	gid_t		gid;	/* upper side vnode's gid */
 	u_short		vmode;	/* upper side vnode's mode */
 	u_short		mask;
 
 	mask = 0;
 	uid = va->va_uid;
 	gid = va->va_gid;
 	vmode = va->va_mode;
 
 	/* check owner */
 	if (cred->cr_uid == uid) {
 		if (accmode & VEXEC)
 			mask |= S_IXUSR;
 		if (accmode & VREAD)
 			mask |= S_IRUSR;
 		if (accmode & VWRITE)
 			mask |= S_IWUSR;
 		return ((vmode & mask) == mask ? 0 : EACCES);
 	}
 
 	/* check group */
 	count = 0;
 	if (groupmember(gid, cred)) {
 		if (accmode & VEXEC)
 			mask |= S_IXGRP;
 		if (accmode & VREAD)
 			mask |= S_IRGRP;
 		if (accmode & VWRITE)
 			mask |= S_IWGRP;
 		return ((vmode & mask) == mask ? 0 : EACCES);
 	}
 
 	/* check other */
 	if (accmode & VEXEC)
 		mask |= S_IXOTH;
 	if (accmode & VREAD)
 		mask |= S_IROTH;
 	if (accmode & VWRITE)
 		mask |= S_IWOTH;
 
 	return ((vmode & mask) == mask ? 0 : EACCES);
 }
 
 static int
 unionfs_access(struct vop_access_args *ap)
 {
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr	va;
 	accmode_t	accmode;
 	int		error;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_access: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 	accmode = ap->a_accmode;
 	error = EACCES;
 
 	if ((accmode & VWRITE) &&
 	    (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) {
 		switch (ap->a_vp->v_type) {
 		case VREG:
 		case VDIR:
 		case VLNK:
 			return (EROFS);
 		default:
 			break;
 		}
 	}
 
 	if (uvp != NULLVP) {
 		error = VOP_ACCESS(uvp, accmode, ap->a_cred, td);
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error);
 
 		return (error);
 	}
 
 	if (lvp != NULLVP) {
 		if (accmode & VWRITE) {
 			if (ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY) {
 				switch (ap->a_vp->v_type) {
 				case VREG:
 				case VDIR:
 				case VLNK:
 					return (EROFS);
 				default:
 					break;
 				}
 			} else if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) {
 				/* check shadow file/dir */
 				if (ump->um_copymode != UNIONFS_TRANSPARENT) {
 					error = unionfs_create_uppervattr(ump,
 					    lvp, &va, ap->a_cred, td);
 					if (error != 0)
 						return (error);
 
 					error = unionfs_check_corrected_access(
 					    accmode, &va, ap->a_cred);
 					if (error != 0)
 						return (error);
 				}
 			}
 			accmode &= ~(VWRITE | VAPPEND);
 			accmode |= VREAD; /* will copy to upper */
 		}
 		error = VOP_ACCESS(lvp, accmode, ap->a_cred, td);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_access: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_getattr(struct vop_getattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr	va;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getattr: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = curthread;
 
 	if (uvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, ap->a_vap, ap->a_cred)) == 0)
 			ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 
 		UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n",
 		    ap->a_vap->va_mode, ap->a_vap->va_uid,
 		    ap->a_vap->va_gid, error);
 
 		return (error);
 	}
 
 	error = VOP_GETATTR(lvp, ap->a_vap, ap->a_cred);
 
 	if (error == 0 && !(ump->um_uppervp->v_mount->mnt_flag & MNT_RDONLY)) {
 		/* correct the attr toward shadow file/dir. */
 		if (ap->a_vp->v_type == VREG || ap->a_vp->v_type == VDIR) {
 			unionfs_create_uppervattr_core(ump, ap->a_vap, &va, td);
 			ap->a_vap->va_mode = va.va_mode;
 			ap->a_vap->va_uid = va.va_uid;
 			ap->a_vap->va_gid = va.va_gid;
 		}
 	}
 
 	if (error == 0)
 		ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getattr: leave mode=%o, uid=%d, gid=%d (%d)\n",
 	    ap->a_vap->va_mode, ap->a_vap->va_uid, ap->a_vap->va_gid, error);
 
 	return (error);
 }
 
 static int
 unionfs_setattr(struct vop_setattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr   *vap;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setattr: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = curthread;
 	vap = ap->a_vap;
 
 	if ((ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
 	     vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
 	     vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL))
 		return (EROFS);
 
 	if (uvp == NULLVP && lvp->v_type == VREG) {
 		error = unionfs_copyfile(unp, (vap->va_size != 0),
 		    ap->a_cred, td);
 		if (error != 0)
 			return (error);
 		uvp = unp->un_uppervp;
 	}
 
 	if (uvp != NULLVP)
 		error = VOP_SETATTR(uvp, vap, ap->a_cred);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_read(struct vop_read_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *tvp;
 
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_read: enter\n"); */
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_READ(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred);
 
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_read: leave (%d)\n", error); */
 
 	return (error);
 }
 
 static int
 unionfs_write(struct vop_write_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *tvp;
 
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_write: enter\n"); */
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_WRITE(tvp, ap->a_uio, ap->a_ioflag, ap->a_cred);
 
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_write: leave (%d)\n", error); */
 
 	return (error);
 }
 
 static int
 unionfs_ioctl(struct vop_ioctl_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *ovp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
  	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 	unp = VTOUNIONFS(ap->a_vp);
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
 	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
 
 	error = VOP_IOCTL(ovp, ap->a_command, ap->a_data, ap->a_fflag,
 	    ap->a_cred, ap->a_td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_ioctl: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_poll(struct vop_poll_args *ap)
 {
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *ovp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
  	vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 	unp = VTOUNIONFS(ap->a_vp);
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
 	VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 
 	if (ovp == NULLVP)
 		return (EBADF);
 
 	return (VOP_POLL(ovp, ap->a_events, ap->a_cred, ap->a_td));
 }
 
 static int
 unionfs_fsync(struct vop_fsync_args *ap)
 {
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *ovp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	unionfs_get_node_status(unp, ap->a_td, &unsp);
 	ovp = (unsp->uns_upper_opencnt ? unp->un_uppervp : unp->un_lowervp);
 	unionfs_tryrem_node_status(unp, unsp);
 
 	if (ovp == NULLVP)
 		return (EBADF);
 
 	return (VOP_FSYNC(ovp, ap->a_waitfor, ap->a_td));
 }
 
 static int
 unionfs_remove(struct vop_remove_args *ap)
 {
 	int		error;
 	char	       *path;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *vp;
 	struct componentname *cnp;
 	struct componentname cn;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_remove: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	error = 0;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	udvp = dunp->un_uppervp;
 	cnp = ap->a_cnp;
 	td = curthread;
 
 	if (ap->a_vp->v_op != &unionfs_vnodeops) {
 		if (ap->a_vp->v_type != VSOCK)
 			return (EINVAL);
 		ump = NULL;
 		vp = uvp = lvp = NULLVP;
 		/* search vnode */
 		VOP_UNLOCK(ap->a_vp, LK_RELEASE);
 		error = unionfs_relookup(udvp, &vp, cnp, &cn, td,
 		    cnp->cn_nameptr, strlen(cnp->cn_nameptr), DELETE);
 		if (error != 0 && error != ENOENT) {
 			vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 			return (error);
 		}
 
 		if (error == 0 && vp == ap->a_vp) {
 			/* target vnode in upper */
 			uvp = vp;
 			vrele(vp);
 			path = NULL;
 		} else {
 			/* target vnode in lower */
 			if (vp != NULLVP) {
 				if (udvp == vp)
 					vrele(vp);
 				else
 					vput(vp);
 			}
 			vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
 			lvp = ap->a_vp;
 			path = ap->a_cnp->cn_nameptr;
 		}
 	} else {
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		unp = VTOUNIONFS(ap->a_vp);
 		uvp = unp->un_uppervp;
 		lvp = unp->un_lowervp;
 		path = unp->un_path;
 	}
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (uvp != NULLVP) {
 		/*
 		 * XXX: if the vnode type is VSOCK, it will create whiteout
 		 *      after remove.
 		 */
 		if (ump == NULL || ump->um_whitemode == UNIONFS_WHITE_ALWAYS ||
 		    lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
 		error = VOP_REMOVE(udvp, uvp, cnp);
 	} else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, path);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_remove: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_link(struct vop_link_args *ap)
 {
 	int		error;
 	int		needrelookup;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct componentname *cnp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_link: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_tdvp);
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	needrelookup = 0;
 	dunp = VTOUNIONFS(ap->a_tdvp);
 	unp = NULL;
 	udvp = dunp->un_uppervp;
 	uvp = NULLVP;
 	cnp = ap->a_cnp;
 	td = curthread;
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (ap->a_vp->v_op != &unionfs_vnodeops)
 		uvp = ap->a_vp;
 	else {
 		unp = VTOUNIONFS(ap->a_vp);
 
 		if (unp->un_uppervp == NULLVP) {
 			if (ap->a_vp->v_type != VREG)
 				return (EOPNOTSUPP);
 
 			error = unionfs_copyfile(unp, 1, cnp->cn_cred, td);
 			if (error != 0)
 				return (error);
 			needrelookup = 1;
 		}
 		uvp = unp->un_uppervp;
 	}
 
 	if (needrelookup != 0)
 		error = unionfs_relookup_for_create(ap->a_tdvp, cnp, td);
 
 	if (error == 0)
 		error = VOP_LINK(udvp, uvp, cnp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_link: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_rename(struct vop_rename_args *ap)
 {
 	int		error;
 	struct vnode   *fdvp;
 	struct vnode   *fvp;
 	struct componentname *fcnp;
 	struct vnode   *tdvp;
 	struct vnode   *tvp;
 	struct componentname *tcnp;
 	struct vnode   *ltdvp;
 	struct vnode   *ltvp;
 	struct thread  *td;
 
 	/* rename target vnodes */
 	struct vnode   *rfdvp;
 	struct vnode   *rfvp;
 	struct vnode   *rtdvp;
 	struct vnode   *rtvp;
 
 	int		needrelookup;
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: enter\n");
 
 	error = 0;
 	fdvp = ap->a_fdvp;
 	fvp = ap->a_fvp;
 	fcnp = ap->a_fcnp;
 	tdvp = ap->a_tdvp;
 	tvp = ap->a_tvp;
 	tcnp = ap->a_tcnp;
 	ltdvp = NULLVP;
 	ltvp = NULLVP;
 	td = curthread;
 	rfdvp = fdvp;
 	rfvp = fvp;
 	rtdvp = tdvp;
 	rtvp = tvp;
 	needrelookup = 0;
 
 #ifdef DIAGNOSTIC
 	if (!(fcnp->cn_flags & HASBUF) || !(tcnp->cn_flags & HASBUF))
 		panic("unionfs_rename: no name");
 #endif
 
 	/* check for cross device rename */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp != NULLVP && fvp->v_mount != tvp->v_mount)) {
 		if (fvp->v_op != &unionfs_vnodeops)
 			error = ENODEV;
 		else
 			error = EXDEV;
 		goto unionfs_rename_abort;
 	}
 
 	/* Renaming a file to itself has no effect. */
 	if (fvp == tvp)
 		goto unionfs_rename_abort;
 
 	/*
 	 * from/to vnode is unionfs node.
 	 */
 
 	KASSERT_UNIONFS_VNODE(fdvp);
 	KASSERT_UNIONFS_VNODE(fvp);
 	KASSERT_UNIONFS_VNODE(tdvp);
 	if (tvp != NULLVP)
 		KASSERT_UNIONFS_VNODE(tvp);
 
 	unp = VTOUNIONFS(fdvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("fdvp=%p, ufdvp=%p, lfdvp=%p\n", fdvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	if (unp->un_uppervp == NULLVP) {
 		error = ENODEV;
 		goto unionfs_rename_abort;
 	}
 	rfdvp = unp->un_uppervp;
 	vref(rfdvp);
 
 	unp = VTOUNIONFS(fvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("fvp=%p, ufvp=%p, lfvp=%p\n", fvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	ump = MOUNTTOUNIONFSMOUNT(fvp->v_mount);
 	if (unp->un_uppervp == NULLVP) {
 		switch (fvp->v_type) {
 		case VREG:
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_copyfile(unp, 1, fcnp->cn_cred, td);
 			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
 		case VDIR:
 			if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 				goto unionfs_rename_abort;
 			error = unionfs_mkshadowdir(ump, rfdvp, unp, fcnp, td);
 			VOP_UNLOCK(fvp, LK_RELEASE);
 			if (error != 0)
 				goto unionfs_rename_abort;
 			break;
 		default:
 			error = ENODEV;
 			goto unionfs_rename_abort;
 		}
 
 		needrelookup = 1;
 	}
 
 	if (unp->un_lowervp != NULLVP)
 		fcnp->cn_flags |= DOWHITEOUT;
 	rfvp = unp->un_uppervp;
 	vref(rfvp);
 
 	unp = VTOUNIONFS(tdvp);
 #ifdef UNIONFS_IDBG_RENAME
 	UNIONFS_INTERNAL_DEBUG("tdvp=%p, utdvp=%p, ltdvp=%p\n", tdvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 	if (unp->un_uppervp == NULLVP) {
 		error = ENODEV;
 		goto unionfs_rename_abort;
 	}
 	rtdvp = unp->un_uppervp;
 	ltdvp = unp->un_lowervp;
 	vref(rtdvp);
 
 	if (tdvp == tvp) {
 		rtvp = rtdvp;
 		vref(rtvp);
 	} else if (tvp != NULLVP) {
 		unp = VTOUNIONFS(tvp);
 #ifdef UNIONFS_IDBG_RENAME
 		UNIONFS_INTERNAL_DEBUG("tvp=%p, utvp=%p, ltvp=%p\n", tvp, unp->un_uppervp, unp->un_lowervp);
 #endif
 		if (unp->un_uppervp == NULLVP)
 			rtvp = NULLVP;
 		else {
 			if (tvp->v_type == VDIR) {
 				error = EINVAL;
 				goto unionfs_rename_abort;
 			}
 			rtvp = unp->un_uppervp;
 			ltvp = unp->un_lowervp;
 			vref(rtvp);
 		}
 	}
 
 	if (rfvp == rtvp)
 		goto unionfs_rename_abort;
 
 	if (needrelookup != 0) {
 		if ((error = vn_lock(fdvp, LK_EXCLUSIVE)) != 0)
 			goto unionfs_rename_abort;
 		error = unionfs_relookup_for_delete(fdvp, fcnp, td);
 		VOP_UNLOCK(fdvp, LK_RELEASE);
 		if (error != 0)
 			goto unionfs_rename_abort;
 
 		/* Locke of tvp is canceled in order to avoid recursive lock. */
 		if (tvp != NULLVP && tvp != tdvp)
 			VOP_UNLOCK(tvp, LK_RELEASE);
 		error = unionfs_relookup_for_rename(tdvp, tcnp, td);
 		if (tvp != NULLVP && tvp != tdvp)
 			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
 		if (error != 0)
 			goto unionfs_rename_abort;
 	}
 
 	error = VOP_RENAME(rfdvp, rfvp, fcnp, rtdvp, rtvp, tcnp);
 
 	if (error == 0) {
 		if (rtvp != NULLVP && rtvp->v_type == VDIR)
 			cache_purge(tdvp);
 		if (fvp->v_type == VDIR && fdvp != tdvp)
 			cache_purge(fdvp);
 	}
 
 	if (ltdvp != NULLVP)
 		VOP_UNLOCK(ltdvp, LK_RELEASE);
 	if (tdvp != rtdvp)
 		vrele(tdvp);
 	if (ltvp != NULLVP)
 		VOP_UNLOCK(ltvp, LK_RELEASE);
 	if (tvp != rtvp && tvp != NULLVP) {
 		if (rtvp == NULLVP)
 			vput(tvp);
 		else
 			vrele(tvp);
 	}
 	if (fdvp != rfdvp)
 		vrele(fdvp);
 	if (fvp != rfvp)
 		vrele(fvp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error);
 
 	return (error);
 
 unionfs_rename_abort:
 	vput(tdvp);
 	if (tdvp != rtdvp)
 		vrele(rtdvp);
 	if (tvp != NULLVP) {
 		if (tdvp != tvp)
 			vput(tvp);
 		else
 			vrele(tvp);
 	}
 	if (tvp != rtvp && rtvp != NULLVP)
 		vrele(rtvp);
 	if (fdvp != rfdvp)
 		vrele(rfdvp);
 	if (fvp != rfvp)
 		vrele(rfvp);
 	vrele(fdvp);
 	vrele(fvp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rename: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_mkdir(struct vop_mkdir_args *ap)
 {
 	int		error;
 	int		lkflags;
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vattr	va;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	error = EROFS;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	lkflags = cnp->cn_lkflags;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 
 	if (udvp != NULLVP) {
 		/* check opaque */
 		if (!(cnp->cn_flags & ISWHITEOUT)) {
 			error = VOP_GETATTR(udvp, &va, cnp->cn_cred);
 			if (error != 0)
 				return (error);
 			if (va.va_flags & OPAQUE) 
 				cnp->cn_flags |= ISWHITEOUT;
 		}
 
 		if ((error = VOP_MKDIR(udvp, &uvp, cnp, ap->a_vap)) == 0) {
 			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			cnp->cn_lkflags = lkflags;
 			vrele(uvp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_mkdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_rmdir(struct vop_rmdir_args *ap)
 {
 	int		error;
 	struct unionfs_node *dunp;
 	struct unionfs_node *unp;
 	struct unionfs_mount *ump;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	unp = VTOUNIONFS(ap->a_vp);
 	cnp = ap->a_cnp;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 
 	if (udvp == NULLVP)
 		return (EROFS);
 
 	if (udvp == uvp)
 		return (EOPNOTSUPP);
 
 	if (uvp != NULLVP) {
 		if (lvp != NULLVP) {
 			error = unionfs_check_rmdir(ap->a_vp, cnp->cn_cred, td);
 			if (error != 0)
 				return (error);
 		}
 		ump = MOUNTTOUNIONFSMOUNT(ap->a_vp->v_mount);
 		if (ump->um_whitemode == UNIONFS_WHITE_ALWAYS || lvp != NULLVP)
 			cnp->cn_flags |= DOWHITEOUT;
 		error = unionfs_relookup_for_delete(ap->a_dvp, cnp, td);
 		if (!error)
 			error = VOP_RMDIR(udvp, uvp, cnp);
 	}
 	else if (lvp != NULLVP)
 		error = unionfs_mkwhiteout(udvp, cnp, td, unp->un_path);
 
 	if (error == 0) {
 		cache_purge(ap->a_dvp);
 		cache_purge(ap->a_vp);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_rmdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_symlink(struct vop_symlink_args *ap)
 {
 	int		error;
 	int		lkflags;
 	struct unionfs_node *dunp;
 	struct componentname *cnp;
 	struct thread  *td;
 	struct vnode   *udvp;
 	struct vnode   *uvp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_symlink: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_dvp);
 
 	error = EROFS;
 	dunp = VTOUNIONFS(ap->a_dvp);
 	cnp = ap->a_cnp;
 	lkflags = cnp->cn_lkflags;
 	td = curthread;
 	udvp = dunp->un_uppervp;
 
 	if (udvp != NULLVP) {
 		error = VOP_SYMLINK(udvp, &uvp, cnp, ap->a_vap, ap->a_target);
 		if (error == 0) {
 			VOP_UNLOCK(uvp, LK_RELEASE);
 			cnp->cn_lkflags = LK_EXCLUSIVE;
 			error = unionfs_nodeget(ap->a_dvp->v_mount, uvp, NULLVP,
 			    ap->a_dvp, ap->a_vpp, cnp, td);
 			cnp->cn_lkflags = lkflags;
 			vrele(uvp);
 		}
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_symlink: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_readdir(struct vop_readdir_args *ap)
 {
 	int		error;
 	int		eofflag;
 	int		locked;
 	int		uio_offset_bk;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct uio     *uio;
 	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 	struct vattr    va;
 
 	int		ncookies_bk;
 	u_long         *cookies_bk;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readdir: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	eofflag = 0;
 	locked = 0;
 	uio_offset_bk = 0;
 	uio = ap->a_uio;
 	uvp = NULLVP;
 	lvp = NULLVP;
 	td = uio->uio_td;
 	ncookies_bk = 0;
 	cookies_bk = NULL;
 
 	vp = ap->a_vp;
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	/* check the open count. unionfs needs to open before readdir. */
 	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 		if (vn_lock(vp, LK_UPGRADE) != 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		locked = 1;
 	}
 	unp = VTOUNIONFS(vp);
 	if (unp == NULL)
 		error = EBADF;
 	else {
 		uvp = unp->un_uppervp;
 		lvp = unp->un_lowervp;
 		unionfs_get_node_status(unp, td, &unsp);
 		if ((uvp != NULLVP && unsp->uns_upper_opencnt <= 0) ||
 			(lvp != NULLVP && unsp->uns_lower_opencnt <= 0)) {
 			unionfs_tryrem_node_status(unp, unsp);
 			error = EBADF;
 		}
 	}
 	if (locked)
 		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 	if (error != 0)
 		goto unionfs_readdir_exit;
 
 	/* check opaque */
 	if (uvp != NULLVP && lvp != NULLVP) {
 		if ((error = VOP_GETATTR(uvp, &va, ap->a_cred)) != 0)
 			goto unionfs_readdir_exit;
 		if (va.va_flags & OPAQUE)
 			lvp = NULLVP;
 	}
 
 	/* upper only */
 	if (uvp != NULLVP && lvp == NULLVP) {
 		error = VOP_READDIR(uvp, uio, ap->a_cred, ap->a_eofflag,
 		    ap->a_ncookies, ap->a_cookies);
 		unsp->uns_readdir_status = 0;
 
 		goto unionfs_readdir_exit;
 	}
 
 	/* lower only */
 	if (uvp == NULLVP && lvp != NULLVP) {
 		error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 		    ap->a_ncookies, ap->a_cookies);
 		unsp->uns_readdir_status = 2;
 
 		goto unionfs_readdir_exit;
 	}
 
 	/*
 	 * readdir upper and lower
 	 */
 	KASSERT(uvp != NULLVP, ("unionfs_readdir: null upper vp"));
 	KASSERT(lvp != NULLVP, ("unionfs_readdir: null lower vp"));
 	if (uio->uio_offset == 0)
 		unsp->uns_readdir_status = 0;
 
 	if (unsp->uns_readdir_status == 0) {
 		/* read upper */
 		error = VOP_READDIR(uvp, uio, ap->a_cred, &eofflag,
 				    ap->a_ncookies, ap->a_cookies);
 
 		if (error != 0 || eofflag == 0)
 			goto unionfs_readdir_exit;
 		unsp->uns_readdir_status = 1;
 
 		/*
 		 * UFS(and other FS) needs size of uio_resid larger than
 		 * DIRBLKSIZ.
 		 * size of DIRBLKSIZ equals DEV_BSIZE.
 		 * (see: ufs/ufs/ufs_vnops.c ufs_readdir func , ufs/ufs/dir.h)
 		 */
 		if (uio->uio_resid <= (uio->uio_resid & (DEV_BSIZE -1)))
 			goto unionfs_readdir_exit;
 
 		/*
 		 * Backup cookies.
 		 * It prepares to readdir in lower.
 		 */
 		if (ap->a_ncookies != NULL) {
 			ncookies_bk = *(ap->a_ncookies);
 			*(ap->a_ncookies) = 0;
 		}
 		if (ap->a_cookies != NULL) {
 			cookies_bk = *(ap->a_cookies);
 			*(ap->a_cookies) = NULL;
 		}
 	}
 
 	/* initialize for readdir in lower */
 	if (unsp->uns_readdir_status == 1) {
 		unsp->uns_readdir_status = 2;
 		/*
 		 * Backup uio_offset. See the comment after the
 		 * VOP_READDIR call on the lower layer.
 		 */
 		uio_offset_bk = uio->uio_offset;
 		uio->uio_offset = 0;
 	}
 
 	if (lvp == NULLVP) {
 		error = EBADF;
 		goto unionfs_readdir_exit;
 	}
 	/* read lower */
 	error = VOP_READDIR(lvp, uio, ap->a_cred, ap->a_eofflag,
 			    ap->a_ncookies, ap->a_cookies);
 
 	/*
 	 * We can't return an uio_offset of 0: this would trigger an
 	 * infinite loop, because the next call to unionfs_readdir would
 	 * always restart with the upper layer (uio_offset == 0) and
 	 * always return some data.
 	 *
 	 * This happens when the lower layer root directory is removed.
 	 * (A root directory deleting of unionfs should not be permitted.
 	 *  But current VFS can not do it.)
 	 */
 	if (uio->uio_offset == 0)
 		uio->uio_offset = uio_offset_bk;
 
 	if (cookies_bk != NULL) {
 		/* merge cookies */
 		int		size;
 		u_long         *newcookies, *pos;
 
 		size = *(ap->a_ncookies) + ncookies_bk;
 		newcookies = (u_long *) malloc(size * sizeof(u_long),
 		    M_TEMP, M_WAITOK);
 		pos = newcookies;
 
 		memcpy(pos, cookies_bk, ncookies_bk * sizeof(u_long));
 		pos += ncookies_bk;
 		memcpy(pos, *(ap->a_cookies), *(ap->a_ncookies) * sizeof(u_long));
 		free(cookies_bk, M_TEMP);
 		free(*(ap->a_cookies), M_TEMP);
 		*(ap->a_ncookies) = size;
 		*(ap->a_cookies) = newcookies;
 	}
 
 unionfs_readdir_exit:
 	if (error != 0 && ap->a_eofflag != NULL)
 		*(ap->a_eofflag) = 1;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readdir: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_readlink(struct vop_readlink_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readlink: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_READLINK(vp, ap->a_uio, ap->a_cred);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_readlink: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_getwritemount(struct vop_getwritemount_args *ap)
 {
 	int		error;
 	struct vnode   *uvp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: enter\n");
 
 	error = 0;
 	vp = ap->a_vp;
 
 	if (vp == NULLVP || (vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EACCES);
 
 	KASSERT_UNIONFS_VNODE(vp);
 
 	uvp = UNIONFSVPTOUPPERVP(vp);
 	if (uvp == NULLVP && VREG == vp->v_type)
 		uvp = UNIONFSVPTOUPPERVP(VTOUNIONFS(vp)->un_dvp);
 
 	if (uvp != NULLVP)
 		error = VOP_GETWRITEMOUNT(uvp, ap->a_mpp);
 	else {
 		VI_LOCK(vp);
 		if (vp->v_iflag & VI_FREE)
 			error = EOPNOTSUPP;
 		else
 			error = EACCES;
 		VI_UNLOCK(vp);
 	}
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getwritemount: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_inactive(struct vop_inactive_args *ap)
 {
 	ap->a_vp->v_object = NULL;
 	vrecycle(ap->a_vp);
 	return (0);
 }
 
 static int
 unionfs_reclaim(struct vop_reclaim_args *ap)
 {
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: enter\n"); */
 
 	unionfs_noderem(ap->a_vp, ap->a_td);
 
 	/* UNIONFS_INTERNAL_DEBUG("unionfs_reclaim: leave\n"); */
 
 	return (0);
 }
 
 static int
 unionfs_print(struct vop_print_args *ap)
 {
 	struct unionfs_node *unp;
 	/* struct unionfs_node_status *unsp; */
 
 	unp = VTOUNIONFS(ap->a_vp);
 	/* unionfs_get_node_status(unp, curthread, &unsp); */
 
 	printf("unionfs_vp=%p, uppervp=%p, lowervp=%p\n",
 	    ap->a_vp, unp->un_uppervp, unp->un_lowervp);
 	/*
 	printf("unionfs opencnt: uppervp=%d, lowervp=%d\n",
 	    unsp->uns_upper_opencnt, unsp->uns_lower_opencnt);
 	*/
 
 	if (unp->un_uppervp != NULLVP)
 		vn_printf(unp->un_uppervp, "unionfs: upper ");
 	if (unp->un_lowervp != NULLVP)
 		vn_printf(unp->un_lowervp, "unionfs: lower ");
 
 	return (0);
 }
 
 static int
 unionfs_islocked(struct vop_islocked_args *ap)
 {
 	struct unionfs_node *unp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	if (unp == NULL)
 		return (vop_stdislocked(ap));
 
 	if (unp->un_uppervp != NULLVP)
 		return (VOP_ISLOCKED(unp->un_uppervp));
 	if (unp->un_lowervp != NULLVP)
 		return (VOP_ISLOCKED(unp->un_lowervp));
 	return (vop_stdislocked(ap));
 }
 
 static int
 unionfs_get_llt_revlock(struct vnode *vp, int flags)
 {
 	int revlock;
 
 	revlock = 0;
 
 	switch (flags & LK_TYPE_MASK) {
 	case LK_SHARED:
 		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 			revlock = LK_UPGRADE;
 		else
 			revlock = LK_RELEASE;
 		break;
 	case LK_EXCLUSIVE:
 	case LK_UPGRADE:
 		revlock = LK_RELEASE;
 		break;
 	case LK_DOWNGRADE:
 		revlock = LK_UPGRADE;
 		break;
 	default:
 		break;
 	}
 
 	return (revlock);
 }
 
 /*
  * The state of an acquired lock is adjusted similarly to
  * the time of error generating. 
  * flags: LK_RELEASE or LK_UPGRADE
  */
 static void
 unionfs_revlock(struct vnode *vp, int flags)
 {
 	if (flags & LK_RELEASE)
 		VOP_UNLOCK(vp, flags);
 	else {
 		/* UPGRADE */
 		if (vn_lock(vp, flags) != 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 }
 
 static int
 unionfs_lock(struct vop_lock1_args *ap)
 {
 	int		error;
 	int		flags;
 	int		revlock;
 	int		interlock;
 	int		uhold;
 	struct mount   *mp;
 	struct unionfs_mount *ump;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	interlock = 1;
 	uhold = 0;
 	flags = ap->a_flags;
 	vp = ap->a_vp;
 
 	if (LK_RELEASE == (flags & LK_TYPE_MASK) || !(flags & LK_TYPE_MASK))
 		return (VOP_UNLOCK(vp, flags | LK_RELEASE));
 
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		goto unionfs_lock_null_vnode;
 
 	ump = MOUNTTOUNIONFSMOUNT(mp);
 	unp = VTOUNIONFS(vp);
 	if (ump == NULL || unp == NULL)
 		goto unionfs_lock_null_vnode;
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
 	if ((revlock = unionfs_get_llt_revlock(vp, flags)) == 0)
 		panic("unknown lock type: 0x%x", flags & LK_TYPE_MASK);
 
 	if ((vp->v_iflag & VI_OWEINACT) != 0)
 		flags |= LK_NOWAIT;
 
 	/*
 	 * Sometimes, lower or upper is already exclusive locked.
 	 * (ex. vfs_domount: mounted vnode is already locked.)
 	 */
 	if ((flags & LK_TYPE_MASK) == LK_EXCLUSIVE &&
 	    vp == ump->um_rootvp)
 		flags |= LK_CANRECURSE;
 
 	if (lvp != NULLVP) {
 		if (uvp != NULLVP && flags & LK_UPGRADE) {
 			/* Share Lock is once released and a deadlock is avoided.  */
 			vholdnz(uvp);
 			uhold = 1;
 			VOP_UNLOCK(uvp, LK_RELEASE);
 			unp = VTOUNIONFS(vp);
 			if (unp == NULL) {
 				/* vnode is released. */
 				VI_UNLOCK(vp);
 				VOP_UNLOCK(lvp, LK_RELEASE);
 				vdrop(uvp);
 				return (EBUSY);
 			}
 		}
 		VI_LOCK_FLAGS(lvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		vholdl(lvp);
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_LOCK(lvp, flags);
 
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
 			/* vnode is released. */
 			VI_UNLOCK(vp);
 			if (error == 0)
 				VOP_UNLOCK(lvp, LK_RELEASE);
 			vdrop(lvp);
 			if (uhold != 0)
 				vdrop(uvp);
 			return (vop_stdlock(ap));
 		}
 	}
 
 	if (error == 0 && uvp != NULLVP) {
 		if (uhold && flags & LK_UPGRADE) {
 			flags &= ~LK_TYPE_MASK;
 			flags |= LK_EXCLUSIVE;
 		}
 		VI_LOCK_FLAGS(uvp, MTX_DUPOK);
 		flags |= LK_INTERLOCK;
 		if (uhold == 0) {
 			vholdl(uvp);
 			uhold = 1;
 		}
 
 		VI_UNLOCK(vp);
 		ap->a_flags &= ~LK_INTERLOCK;
 
 		error = VOP_LOCK(uvp, flags);
 
 		VI_LOCK(vp);
 		unp = VTOUNIONFS(vp);
 		if (unp == NULL) {
 			/* vnode is released. */
 			VI_UNLOCK(vp);
 			if (error == 0)
 				VOP_UNLOCK(uvp, LK_RELEASE);
 			vdrop(uvp);
 			if (lvp != NULLVP) {
 				VOP_UNLOCK(lvp, LK_RELEASE);
 				vdrop(lvp);
 			}
 			return (vop_stdlock(ap));
 		}
 		if (error != 0 && lvp != NULLVP) {
 			/* rollback */
 			VI_UNLOCK(vp);
 			unionfs_revlock(lvp, revlock);
 			interlock = 0;
 		}
 	}
 
 	if (interlock)
 		VI_UNLOCK(vp);
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
 		vdrop(uvp);
 
 	return (error);
 
 unionfs_lock_null_vnode:
 	ap->a_flags |= LK_INTERLOCK;
 	return (vop_stdlock(ap));
 }
 
 static int
 unionfs_unlock(struct vop_unlock_args *ap)
 {
 	int		error;
 	int		flags;
 	int		uhold;
 	struct vnode   *vp;
 	struct vnode   *lvp;
 	struct vnode   *uvp;
 	struct unionfs_node *unp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = 0;
 	uhold = 0;
 	flags = ap->a_flags | LK_RELEASE;
 	vp = ap->a_vp;
 
 	unp = VTOUNIONFS(vp);
 	if (unp == NULL)
 		goto unionfs_unlock_null_vnode;
 	lvp = unp->un_lowervp;
 	uvp = unp->un_uppervp;
 
 	if (lvp != NULLVP) {
 		vholdnz(lvp);
 		error = VOP_UNLOCK(lvp, flags);
 	}
 
 	if (error == 0 && uvp != NULLVP) {
 		vholdnz(uvp);
 		uhold = 1;
 		error = VOP_UNLOCK(uvp, flags);
 	}
 
 	if (lvp != NULLVP)
 		vdrop(lvp);
 	if (uhold != 0)
 		vdrop(uvp);
 
 	return error;
 
 unionfs_unlock_null_vnode:
 	return (vop_stdunlock(ap));
 }
 
 static int
 unionfs_pathconf(struct vop_pathconf_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	return (VOP_PATHCONF(vp, ap->a_name, ap->a_retval));
 }
 
 static int
 unionfs_advlock(struct vop_advlock_args *ap)
 {
 	int error;
 	struct unionfs_node *unp;
 	struct unionfs_node_status *unsp;
 	struct vnode   *vp;
 	struct vnode   *uvp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	vp = ap->a_vp;
 	td = curthread;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 
 	if (uvp == NULLVP) {
 		error = unionfs_copyfile(unp, 1, td->td_ucred, td);
 		if (error != 0)
 			goto unionfs_advlock_abort;
 		uvp = unp->un_uppervp;
 
 		unionfs_get_node_status(unp, td, &unsp);
 		if (unsp->uns_lower_opencnt > 0) {
 			/* try reopen the vnode */
 			error = VOP_OPEN(uvp, unsp->uns_lower_openmode,
 				td->td_ucred, td, NULL);
 			if (error)
 				goto unionfs_advlock_abort;
 			unsp->uns_upper_opencnt++;
 			VOP_CLOSE(unp->un_lowervp, unsp->uns_lower_openmode, td->td_ucred, td);
 			unsp->uns_lower_opencnt--;
 		} else
 			unionfs_tryrem_node_status(unp, unsp);
 	}
 
 	VOP_UNLOCK(vp, LK_RELEASE);
 
 	error = VOP_ADVLOCK(uvp, ap->a_id, ap->a_op, ap->a_fl, ap->a_flags);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
 	return error;
 
 unionfs_advlock_abort:
 	VOP_UNLOCK(vp, LK_RELEASE);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_advlock: leave (%d)\n", error);
 
 	return error;
 }
 
 static int
 unionfs_strategy(struct vop_strategy_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 #ifdef DIAGNOSTIC
 	if (vp == NULLVP)
 		panic("unionfs_strategy: nullvp");
 
 	if (ap->a_bp->b_iocmd == BIO_WRITE && vp == unp->un_lowervp)
 		panic("unionfs_strategy: writing to lowervp");
 #endif
 
 	return (VOP_STRATEGY(vp, ap->a_bp));
 }
 
 static int
 unionfs_getacl(struct vop_getacl_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getacl: enter\n");
 
 	error = VOP_GETACL(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_getacl: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_setacl(struct vop_setacl_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setacl: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	if (uvp == NULLVP && lvp->v_type == VREG) {
 		if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0)
 			return (error);
 		uvp = unp->un_uppervp;
 	}
 
 	if (uvp != NULLVP)
 		error = VOP_SETACL(uvp, ap->a_type, ap->a_aclp, ap->a_cred, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setacl: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_aclcheck(struct vop_aclcheck_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	error = VOP_ACLCHECK(vp, ap->a_type, ap->a_aclp, ap->a_cred, ap->a_td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_aclcheck: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_openextattr(struct vop_openextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *tvp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	tvp = (unp->un_uppervp != NULLVP ? unp->un_uppervp : unp->un_lowervp);
 
 	if ((tvp == unp->un_uppervp && (unp->un_flag & UNIONFS_OPENEXTU)) ||
 	    (tvp == unp->un_lowervp && (unp->un_flag & UNIONFS_OPENEXTL)))
 		return (EBUSY);
 
 	error = VOP_OPENEXTATTR(tvp, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
 		if (vn_lock(vp, LK_UPGRADE) != 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag |= UNIONFS_OPENEXTU;
 		else
 			unp->un_flag |= UNIONFS_OPENEXTL;
 		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 	}
 
 	return (error);
 }
 
 static int
 unionfs_closeextattr(struct vop_closeextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 	struct vnode   *tvp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	tvp = NULLVP;
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		tvp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		tvp = unp->un_lowervp;
 
 	if (tvp == NULLVP)
 		return (EOPNOTSUPP);
 
 	error = VOP_CLOSEEXTATTR(tvp, ap->a_commit, ap->a_cred, ap->a_td);
 
 	if (error == 0) {
 		if (vn_lock(vp, LK_UPGRADE) != 0)
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (tvp == unp->un_uppervp)
 			unp->un_flag &= ~UNIONFS_OPENEXTU;
 		else
 			unp->un_flag &= ~UNIONFS_OPENEXTL;
 		vn_lock(vp, LK_DOWNGRADE | LK_RETRY);
 	}
 
 	return (error);
 }
 
 static int
 unionfs_getextattr(struct vop_getextattr_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = NULLVP;
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		vp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		vp = unp->un_lowervp;
 
 	if (vp == NULLVP)
 		return (EOPNOTSUPP);
 
 	return (VOP_GETEXTATTR(vp, ap->a_attrnamespace, ap->a_name,
 	    ap->a_uio, ap->a_size, ap->a_cred, ap->a_td));
 }
 
 static int
 unionfs_setextattr(struct vop_setextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *ovp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	ovp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: enter (un_flag=%x)\n", unp->un_flag);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		ovp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		ovp = unp->un_lowervp;
 
 	if (ovp == NULLVP)
 		return (EOPNOTSUPP);
 
 	if (ovp == lvp && lvp->v_type == VREG) {
 		VOP_CLOSEEXTATTR(lvp, 0, cred, td);
 		if (uvp == NULLVP &&
 		    (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
 unionfs_setextattr_reopen:
 			if ((unp->un_flag & UNIONFS_OPENEXTL) &&
 			    VOP_OPENEXTATTR(lvp, cred, td)) {
 #ifdef DIAGNOSTIC
 				panic("unionfs: VOP_OPENEXTATTR failed");
 #endif
 				unp->un_flag &= ~UNIONFS_OPENEXTL;
 			}
 			goto unionfs_setextattr_abort;
 		}
 		uvp = unp->un_uppervp;
 		if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0)
 			goto unionfs_setextattr_reopen;
 		unp->un_flag &= ~UNIONFS_OPENEXTL;
 		unp->un_flag |= UNIONFS_OPENEXTU;
 		ovp = uvp;
 	}
 
 	if (ovp == uvp)
 		error = VOP_SETEXTATTR(ovp, ap->a_attrnamespace, ap->a_name,
 		    ap->a_uio, cred, td);
 
 unionfs_setextattr_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_setextattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_listextattr(struct vop_listextattr_args *ap)
 {
 	struct unionfs_node *unp;
 	struct vnode   *vp;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	unp = VTOUNIONFS(ap->a_vp);
 	vp = NULLVP;
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		vp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		vp = unp->un_lowervp;
 
 	if (vp == NULLVP)
 		return (EOPNOTSUPP);
 
 	return (VOP_LISTEXTATTR(vp, ap->a_attrnamespace, ap->a_uio,
 	    ap->a_size, ap->a_cred, ap->a_td));
 }
 
 static int
 unionfs_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct vnode   *ovp;
 	struct ucred   *cred;
 	struct thread  *td;
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	ovp = NULLVP;
 	cred = ap->a_cred;
 	td = ap->a_td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: enter (un_flag=%x)\n", unp->un_flag);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	if (unp->un_flag & UNIONFS_OPENEXTU)
 		ovp = unp->un_uppervp;
 	else if (unp->un_flag & UNIONFS_OPENEXTL)
 		ovp = unp->un_lowervp;
 
 	if (ovp == NULLVP)
 		return (EOPNOTSUPP);
 
 	if (ovp == lvp && lvp->v_type == VREG) {
 		VOP_CLOSEEXTATTR(lvp, 0, cred, td);
 		if (uvp == NULLVP &&
 		    (error = unionfs_copyfile(unp, 1, cred, td)) != 0) {
 unionfs_deleteextattr_reopen:
 			if ((unp->un_flag & UNIONFS_OPENEXTL) &&
 			    VOP_OPENEXTATTR(lvp, cred, td)) {
 #ifdef DIAGNOSTIC
 				panic("unionfs: VOP_OPENEXTATTR failed");
 #endif
 				unp->un_flag &= ~UNIONFS_OPENEXTL;
 			}
 			goto unionfs_deleteextattr_abort;
 		}
 		uvp = unp->un_uppervp;
 		if ((error = VOP_OPENEXTATTR(uvp, cred, td)) != 0)
 			goto unionfs_deleteextattr_reopen;
 		unp->un_flag &= ~UNIONFS_OPENEXTL;
 		unp->un_flag |= UNIONFS_OPENEXTU;
 		ovp = uvp;
 	}
 
 	if (ovp == uvp)
 		error = VOP_DELETEEXTATTR(ovp, ap->a_attrnamespace, ap->a_name,
 		    ap->a_cred, ap->a_td);
 
 unionfs_deleteextattr_abort:
 	UNIONFS_INTERNAL_DEBUG("unionfs_deleteextattr: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_setlabel(struct vop_setlabel_args *ap)
 {
 	int		error;
 	struct unionfs_node *unp;
 	struct vnode   *uvp;
 	struct vnode   *lvp;
 	struct thread  *td;
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: enter\n");
 
 	KASSERT_UNIONFS_VNODE(ap->a_vp);
 
 	error = EROFS;
 	unp = VTOUNIONFS(ap->a_vp);
 	uvp = unp->un_uppervp;
 	lvp = unp->un_lowervp;
 	td = ap->a_td;
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	if (uvp == NULLVP && lvp->v_type == VREG) {
 		if ((error = unionfs_copyfile(unp, 1, ap->a_cred, td)) != 0)
 			return (error);
 		uvp = unp->un_uppervp;
 	}
 
 	if (uvp != NULLVP)
 		error = VOP_SETLABEL(uvp, ap->a_label, ap->a_cred, td);
 
 	UNIONFS_INTERNAL_DEBUG("unionfs_setlabel: leave (%d)\n", error);
 
 	return (error);
 }
 
 static int
 unionfs_vptofh(struct vop_vptofh_args *ap)
 {
 	return (EOPNOTSUPP);
 }
 
 static int
 unionfs_add_writecount(struct vop_add_writecount_args *ap)
 {
 	struct vnode *tvp, *vp;
 	struct unionfs_node *unp;
 	int error;
 
 	vp = ap->a_vp;
 	unp = VTOUNIONFS(vp);
 	tvp = unp->un_uppervp != NULL ? unp->un_uppervp : unp->un_lowervp;
 	VI_LOCK(vp);
 	/* text refs are bypassed to lowervp */
 	VNASSERT(vp->v_writecount >= 0, vp, ("wrong null writecount"));
 	VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 	    ("wrong writecount inc %d", ap->a_inc));
 	if (tvp != NULL)
 		error = VOP_ADD_WRITECOUNT(tvp, ap->a_inc);
 	else if (vp->v_writecount < 0)
 		error = ETXTBSY;
 	else
 		error = 0;
 	if (error == 0)
 		vp->v_writecount += ap->a_inc;
 	VI_UNLOCK(vp);
 	return (error);
 }
 
 struct vop_vector unionfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		unionfs_access,
 	.vop_aclcheck =		unionfs_aclcheck,
 	.vop_advlock =		unionfs_advlock,
 	.vop_bmap =		VOP_EOPNOTSUPP,
 	.vop_cachedlookup =	unionfs_lookup,
 	.vop_close =		unionfs_close,
 	.vop_closeextattr =	unionfs_closeextattr,
 	.vop_create =		unionfs_create,
 	.vop_deleteextattr =	unionfs_deleteextattr,
 	.vop_fsync =		unionfs_fsync,
 	.vop_getacl =		unionfs_getacl,
 	.vop_getattr =		unionfs_getattr,
 	.vop_getextattr =	unionfs_getextattr,
 	.vop_getwritemount =	unionfs_getwritemount,
 	.vop_inactive =		unionfs_inactive,
 	.vop_need_inactive =	vop_stdneed_inactive,
 	.vop_islocked =		unionfs_islocked,
 	.vop_ioctl =		unionfs_ioctl,
 	.vop_link =		unionfs_link,
 	.vop_listextattr =	unionfs_listextattr,
 	.vop_lock1 =		unionfs_lock,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		unionfs_mkdir,
 	.vop_mknod =		unionfs_mknod,
 	.vop_open =		unionfs_open,
 	.vop_openextattr =	unionfs_openextattr,
 	.vop_pathconf =		unionfs_pathconf,
 	.vop_poll =		unionfs_poll,
 	.vop_print =		unionfs_print,
 	.vop_read =		unionfs_read,
 	.vop_readdir =		unionfs_readdir,
 	.vop_readlink =		unionfs_readlink,
 	.vop_reclaim =		unionfs_reclaim,
 	.vop_remove =		unionfs_remove,
 	.vop_rename =		unionfs_rename,
 	.vop_rmdir =		unionfs_rmdir,
 	.vop_setacl =		unionfs_setacl,
 	.vop_setattr =		unionfs_setattr,
 	.vop_setextattr =	unionfs_setextattr,
 	.vop_setlabel =		unionfs_setlabel,
 	.vop_strategy =		unionfs_strategy,
 	.vop_symlink =		unionfs_symlink,
 	.vop_unlock =		unionfs_unlock,
 	.vop_whiteout =		unionfs_whiteout,
 	.vop_write =		unionfs_write,
 	.vop_vptofh =		unionfs_vptofh,
 	.vop_add_writecount =	unionfs_add_writecount,
 };
+VFS_VOP_VECTOR_REGISTER(unionfs_vnodeops);
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c	(revision 355789)
+++ head/sys/kern/uipc_mqueue.c	(revision 355790)
@@ -1,2943 +1,2944 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted, each message queue appears
  *    in mounted directory, user can change queue's permission and
  *    ownership, or remove a queue. Manually creating a file in the
  *    directory causes a message queue to be created in the kernel with
  *    default message queue attributes applied and same name used, this
  *    method is not advocated since mq_open syscall allows user to specify
  *    different attributes. Also the file system can be mounted multiple
  *    times at different mount points but shows same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
  *    but directly operate on internal data structure, this allows user to
  *    use the IPC facility without having to mount mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 #include <security/audit/audit.h>
 
 FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,
 	mqfstype_root,
 	mqfstype_dir,
 	mqfstype_this,
 	mqfstype_parent,
 	mqfstype_file,
 	mqfstype_symlink,
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	struct sx		mi_lock;
 	struct mqfs_node	*mi_root;
 	struct unrhdr		*mi_unrhdr;
 };
 
 struct mqfs_vdata {
 	LIST_ENTRY(mqfs_vdata)	mv_link;
 	struct mqfs_node	*mv_node;
 	struct vnode		*mv_vnode;
 	struct task		mv_task;
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	char			mn_name[MQFS_NAMELEN+1];
 	struct mqfs_info	*mn_info;
 	struct mqfs_node	*mn_parent;
 	LIST_HEAD(,mqfs_node)	mn_children;
 	LIST_ENTRY(mqfs_node)	mn_sibling;
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
 	const void		*mn_pr_root;
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	int			mn_deleted;
 	uint32_t		mn_fileno;
 	void			*mn_data;
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;
 	struct sigevent			nt_sigev;
 	ksiginfo_t			nt_ksi;
 	struct proc			*nt_proc;
 };
 
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;
 	long		mq_maxmsg;
 	long		mq_msgsize;
 	long		mq_curmsgs;
 	long		mq_totalbytes;
 	struct msgq	mq_msgq;
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
 	"POSIX real time message queue");
 
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 static uma_zone_t		mqnode_zone;
 static uma_zone_t		mqueue_zone;
 static uma_zone_t		mvdata_zone;
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 static unsigned			mqfs_osd_jail_slot;
 
 /*
  * Directory structure construction and manipulation
  */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 static int	mqfs_prison_remove(void *obj, void *data);
 
 /*
  * Message queue construction and maniplation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 struct filterops mq_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqread,
 };
 struct filterops mq_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqwrite,
 };
 
 /*
  * Initialize fileno bitmap
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = new_unrhdr(1, INT_MAX, NULL);
 	mi->mi_unrhdr = up;
 }
 
 /*
  * Tear down fileno bitmap
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = mi->mi_unrhdr;
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(up);
 }
 
 /*
  * Allocate a file number
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	/* make sure our parent has a file number */
 	if (mn->mn_parent && !mn->mn_parent->mn_fileno)
 		mqfs_fileno_alloc(mi, mn->mn_parent);
 
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	case mqfstype_this:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = mn->mn_parent->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (mn->mn_parent == mi->mi_root) {
 			mn->mn_fileno = mn->mn_parent->mn_fileno;
 			break;
 		}
 		KASSERT(mn->mn_parent->mn_parent != NULL,
 		    ("mqfstype_parent node has no grandparent"));
 		mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		break;
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* ignore these, as they don't "own" their file number */
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d", 
 			mn->mn_type));
 		break;
 	}
 }
 
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 	return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
 }
 
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 	uma_zfree(mqnode_zone, node);
 }
 
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 	atomic_add_int(&node->mn_refcount, 1);
 }
 
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	struct mqfs_info *mqfs;
 	int old, exp;
 
 	mqfs = node->mn_info;
 	old = atomic_fetchadd_int(&node->mn_refcount, -1);
 	if (node->mn_type == mqfstype_dir ||
 	    node->mn_type == mqfstype_root)
 		exp = 3; /* include . and .. */
 	else
 		exp = 1;
 	if (old == exp) {
 		int locked = sx_xlocked(&mqfs->mi_lock);
 		if (!locked)
 			sx_xlock(&mqfs->mi_lock);
 		mqfs_destroy(node);
 		if (!locked)
 			sx_xunlock(&mqfs->mi_lock);
 	}
 }
 
 /*
  * Add a node to a directory
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	node->mn_info = parent->mn_info;
 	node->mn_parent = parent;
 	LIST_INIT(&node->mn_children);
 	LIST_INIT(&node->mn_vnodes);
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 	return (0);
 }
 
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_pr_root = cred->cr_prison->pr_root;
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	vfs_timestamp(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Add . and .. to a directory
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *dir;
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = '.';
 	dir->mn_type = mqfstype_this;
 	dir->mn_refcount = 1;
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = dir->mn_name[1] = '.';
 	dir->mn_type = mqfstype_parent;
 	dir->mn_refcount = 1;
 
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 
 	if (mqfs_fixup_dir(node) != 0) {
 		mqfs_destroy(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Create a symlink
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 #endif
 
 /*
  * Destroy a node or a tree of nodes
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *parent;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* destroy children */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
 		while (! LIST_EMPTY(&node->mn_children))
 			mqfs_destroy(LIST_FIRST(&node->mn_children));
 
 	/* unlink from parent */
 	if ((parent = node->mn_parent) != NULL) {
 		KASSERT(parent->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance
  */
 static int
 mqfs_mount(struct mount *mp)
 {
 	struct statfs *sbp;
 
 	if (mp->mnt_flag & MNT_UPDATE)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 
 	sbp = &mp->mnt_stat;
 	vfs_mountedfrom(mp, "mqueue");
 	sbp->f_bsize = PAGE_SIZE;
 	sbp->f_iosize = PAGE_SIZE;
 	sbp->f_blocks = 1;
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = 1;
 	sbp->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags)
 {
 	int error;
 
 	error = vflush(mp, 0, (mntflags & MNT_FORCE) ?  FORCECLOSE : 0,
 	    curthread);
 	return (error);
 }
 
 /*
  * Return a root vnode
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct mqfs_info *mqfs;
 	int ret;
 
 	mqfs = VFSTOMQFS(mp);
 	ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
 	return (ret);
 }
 
 /*
  * Return filesystem stats
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
 	osd_method_t methods[PR_MAXMETHOD] = {
 	    [PR_METHOD_REMOVE] = mqfs_prison_remove,
 	};
 
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata",
 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
 		NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mi = &mqfs_data;
 	sx_init(&mi->mi_lock, "mqfs lock");
 	/* set up the root diretory */
 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 		mqfstype_root);
 	root->mn_info = mi;
 	LIST_INIT(&root->mn_children);
 	LIST_INIT(&root->mn_vnodes);
 	mi->mi_root = root;
 	mqfs_fileno_init(mi);
 	mqfs_fileno_alloc(mi, root);
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 	struct mqfs_info *mi;
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
 	osd_jail_deregister(mqfs_osd_jail_slot);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	mqfs_destroy(mi->mi_root);
 	mi->mi_root = NULL;
 	mqfs_fileno_uninit(mi);
 	sx_destroy(&mi->mi_lock);
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * task routine
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *vp = (struct vnode *)context;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vrecycle(vp);
 	VOP_UNLOCK(vp, 0);
 	vdrop(vp);
 }
 
 /*
  * Allocate a vnode
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	struct mqfs_info  *mqfs;
 	struct vnode *newvpp;
 	int error;
 
 	mqfs = pn->mn_info;
 	*vpp = NULL;
 	sx_xlock(&mqfs->mi_lock);
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			break;
 		}
 	}
 
 	if (vd != NULL) {
 found:
 		*vpp = vd->mv_vnode;
 		sx_xunlock(&mqfs->mi_lock);
 		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
 		vdrop(*vpp);
 		return (error);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
 	if (error)
 		return (error);
 	vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(newvpp, mp);
 	if (error != 0)
 		return (error);
 
 	sx_xlock(&mqfs->mi_lock);
 	/*
 	 * Check if it has already been allocated
 	 * while we were blocked.
 	 */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			sx_xunlock(&mqfs->mi_lock);
 
 			vgone(newvpp);
 			vput(newvpp);
 			goto found;
 		}
 	}
 
 	*vpp = newvpp;
 
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	mqnode_addref(pn);
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 /* 
  * Search a directory entry
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
 {
 	struct mqfs_node *pn;
 	const void *pr_root;
 
 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
 	pr_root = cred->cr_prison->pr_root;
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/* Only match names within the same prison root directory */
 		if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) &&
 		    strncmp(pn->mn_name, name, len) == 0 &&
 		    pn->mn_name[len] == '\0')
 			return (pn);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a file or directory.
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqfs_info *mqfs;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	mqfs = pd->mn_info;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		VOP_UNLOCK(dvp, 0);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	/* named node */
 	sx_xlock(&mqfs->mi_lock);
 	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error) {
 				mqnode_release(pn);
 				return (error);
 			}
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				mqnode_release(pn);
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		mqnode_release(pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 	
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	int rc;
 
 	rc = mqfs_lookupx(ap);
 	return (rc);
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		error = ENOSPC;
 	} else {
 		mqnode_addref(pn);
 		sx_xunlock(&mqfs->mi_lock);
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 		if (error)
 			mqfs_destroy(pn);
 		else
 			pn->mn_data = mq;
 	}
 	if (error)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry
  */
 static
 int do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *parent;
 	struct mqfs_vdata *vd;
 	int error = 0;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	if (ucred->cr_uid != pn->mn_uid &&
 	    (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0)
 		error = EACCES;
 	else if (!pn->mn_deleted) {
 		parent = pn->mn_parent;
 		pn->mn_parent = NULL;
 		pn->mn_deleted = 1;
 		LIST_REMOVE(pn, mn_sibling);
 		LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 			cache_purge(vd->mv_vnode);
 			vhold(vd->mv_vnode);
 			taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
 		}
 		mqnode_release(pn);
 		mqnode_release(parent);
 	} else
 		error = ENOENT;
 	return (error);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn;
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
                 return (EPERM);
 	pn = VTON(ap->a_vp);
 	sx_xlock(&mqfs->mi_lock);
 	error = do_unlink(pn, ap->a_cnp->cn_cred);
 	sx_xunlock(&mqfs->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
 		vrecycle(ap->a_vp);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn;
 	struct mqfs_vdata *vd;
 
 	vd = vp->v_data;
 	pn = vd->mv_node;
 	sx_xlock(&mqfs->mi_lock);
 	vp->v_data = NULL;
 	LIST_REMOVE(vd, mv_link);
 	uma_zfree(mvdata_zone, vd);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	struct file *a_fp;
 };
 #endif
 
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	accmode_t a_accmode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr vattr;
 	int error;
 
 	error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 	if (error)
 		return (error);
 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
 	    vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Get file attributes
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn = VTON(vp);
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 
 	vap->va_type = vp->v_type;
 	vap->va_mode = pn->mn_mode;
 	vap->va_nlink = 1;
 	vap->va_uid = pn->mn_uid;
 	vap->va_gid = pn->mn_gid;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = pn->mn_fileno;
 	vap->va_size = 0;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_bytes = vap->va_size = 0;
 	vap->va_atime = pn->mn_atime;
 	vap->va_mtime = pn->mn_mtime;
 	vap->va_ctime = pn->mn_ctime;
 	vap->va_birthtime = pn->mn_birth;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = NODEV;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 	return (error);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 /*
  * Set attributes
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	td = curthread;
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	error = c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	char buf[80];
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct mqueue *mq;
 	int len, error;
 
 	if (vp->v_type != VREG)
 		return (EINVAL);
 
 	mq = VTOMQ(vp);
 	snprintf(buf, sizeof(buf),
 		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 		mq->mq_totalbytes,
 		mq->mq_maxmsg,
 		mq->mq_curmsgs,
 		mq->mq_msgsize);
 	buf[sizeof(buf)-1] = '\0';
 	len = strlen(buf);
 	error = uiomove_frombuf(buf, len, uio);
 	return (error);
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	const void *pr_root;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
         }
 
 	error = 0;
 	offset = 0;
 
 	pr_root = ap->a_cred->cr_prison->pr_root;
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		entry.d_reclen = sizeof(entry);
 
 		/*
 		 * Only show names within the same prison root directory
 		 * (or not associated with a prison, e.g. "." and "..").
 		 */
 		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
 			continue;
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		dirent_terminate(&entry);
 		if (entry.d_reclen > uio->uio_resid)
                         break;
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
                         if (error)
                                 break;
                 }
                 offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struvt vnode **a_vpp;
 	struvt componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		ap->a_vap->cn_cred, ap->a_vap->va_mode);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	if (pn == NULL) {
 		error = ENOSPC;
 	} else {
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 	}
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * See if this prison root is obsolete, and clean up associated queues if it is.
  */
 static int
 mqfs_prison_remove(void *obj, void *data __unused)
 {
 	const struct prison *pr = obj;
 	const struct prison *tpr;
 	struct mqfs_node *pn, *tpn;
 	int found;
 
 	found = 0;
 	TAILQ_FOREACH(tpr, &allprison, pr_list) {
 		if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0)
 			found = 1;
 	}
 	if (!found) {
 		/*
 		 * No jails are rooted in this directory anymore,
 		 * so no queues should be either.
 		 */
 		sx_xlock(&mqfs_data.mi_lock);
 		LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
 		    mn_sibling, tpn) {
 			if (pn->mn_pr_root == pr->pr_root)
 				(void)do_unlink(pn, curthread->td_ucred);
 		}
 		sx_xunlock(&mqfs_data.mi_lock);
 	}
 	return (0);
 }
 
 /*
  * Allocate a message queue
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *mq;
 
 	if (curmq >= maxmq)
 		return (NULL);
 	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&mq->mq_msgq);
 	if (attr != NULL) {
 		mq->mq_maxmsg = attr->mq_maxmsg;
 		mq->mq_msgsize = attr->mq_msgsize;
 	} else {
 		mq->mq_maxmsg = default_maxmsg;
 		mq->mq_msgsize = default_msgsize;
 	}
 	mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
 	knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
 	knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
 	atomic_add_int(&curmq, 1);
 	return (mq);
 }
 
 /*
  * Destroy a message queue
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *msg;
 
 	while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
 		TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
 		free(msg, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 	seldrain(&mq->mq_rsel);
 	seldrain(&mq->mq_wsel);
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *msg;
 	size_t len;
 	int error;
 
 	len = sizeof(struct mqueue_msg) + msg_size;
 	msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
 	error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
 	    msg_size);
 	if (error) {
 		free(msg, M_MQUEUEDATA);
 		msg = NULL;
 	} else {
 		msg->msg_size = msg_size;
 		msg->msg_prio = msg_prio;
 	}
 	return (msg);
 }
 
 /*
  * Save a message to user space
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int error;
 
 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
 		msg->msg_size);
 	if (error == 0 && msg_prio != NULL)
 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
 	return (error);
 }
 
 /*
  * Free a message's memory
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	free(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	error = 0;
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Send realtime a signal to process which registered itself
  * successfully by mq_notify.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *nt;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	nt = mq->mq_notifier;
 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
 		p = nt->nt_proc;
 		error = sigev_findtd(p, &nt->nt_sigev, &td);
 		if (error) {
 			mq->mq_notifier = NULL;
 			return;
 		}
 		if (!KSI_ONQ(&nt->nt_ksi)) {
 			ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
 			tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
 		}
 		PROC_UNLOCK(p);
 	}
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Get a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 	
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
 		if (nt->nt_ksi.ksi_mqd == fd)
 			break;
 	}
 	return (nt);
 }
 
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	PROC_LOCK(p);
 	nt = notifier_search(p, fd);
 	if (nt != NULL) {
 		if (mq->mq_notifier == nt)
 			mq->mq_notifier = NULL;
 		sigqueue_take(&nt->nt_ksi);
 		notifier_delete(p, nt);
 	}
 	PROC_UNLOCK(p);
 }
 
 static int
 kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
     const struct mq_attr *attr)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mqfs_node *pn;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, cmode;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	fdp = td->td_proc->p_fd;
 	cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	if ((flags & O_CREAT) != 0 && attr != NULL) {
 		if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
 			return (EINVAL);
 	}
 
 	error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash  (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters. 
 	 */
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	/*
 	 * "." and ".." are magic directories, populated on the fly, and cannot
 	 * be opened as queues.
 	 */
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(attr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			accmode_t accmode = 0;
 
 			if (flags & FREAD)
 				accmode |= VREAD;
 			if (flags & FWRITE)
 				accmode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 				    pn->mn_gid, accmode, td->td_ucred, NULL);
 		}
 	}
 
 	if (error) {
 		sx_xunlock(&mqfs_data.mi_lock);
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
 	    &mqueueops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
  * Syscall to open a message queue.
  */
 int
 sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
 	    uap->attr != NULL ? &attr : NULL));
 }
 
 /*
  * Syscall to unlink a message queue.
  */
 int
 sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
 
 /*
  * Get message queue by giving file slot
  */
 static int
 _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = func(td, fd, rightsp, fpp);
 	if (error)
 		return (error);
 	if (&mqueueops != (*fpp)->f_ops) {
 		fdrop(*fpp, td);
 		return (EBADF);
 	}
 	pn = (*fpp)->f_data;
 	if (ppn)
 		*ppn = pn;
 	if (pmq)
 		*pmq = pn->mn_data;
 	return (0);
 }
 
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_event_rights, fget,
 	    fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_read_rights, fget_read,
 	    fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_write_rights, fget_write,
 	    fpp, ppn, pmq);
 }
 
 static int
 kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
     struct mq_attr *oattr)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	u_int oflag, flag;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
 		return (EINVAL);
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	oattr->mq_maxmsg  = mq->mq_maxmsg;
 	oattr->mq_msgsize = mq->mq_msgsize;
 	oattr->mq_curmsgs = mq->mq_curmsgs;
 	if (attr != NULL) {
 		do {
 			oflag = flag = fp->f_flag;
 			flag &= ~O_NONBLOCK;
 			flag |= (attr->mq_flags & O_NONBLOCK);
 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
 	} else
 		oflag = fp->f_flag;
 	oattr->mq_flags = (O_NONBLOCK & oflag);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error != 0)
 			return (error);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		bzero(oattr.__reserved, sizeof(oattr.__reserved));
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	}
 	return (error);
 }
 
 int
 sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 static int
 kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp, *fp2;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (sigev != NULL) {
 		if (sigev->sigev_notify != SIGEV_SIGNAL &&
 		    sigev->sigev_notify != SIGEV_THREAD_ID &&
 		    sigev->sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
 		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
 		    !_SIG_VALID(sigev->sigev_signo))
 			return (EINVAL);
 	}
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	FILEDESC_SLOCK(fdp);
 	fp2 = fget_locked(fdp, mqd);
 	if (fp2 == NULL) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights(fdp, mqd), &cap_event_rights);
 	if (error) {
 		FILEDESC_SUNLOCK(fdp);
 		goto out;
 	}
 #endif
 	if (fp2 != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
 	if (sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = *sigev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * if there is no receivers and message queue
 			 * is not empty, we should send notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 int
 sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev, *evp;
 	int error;
 
 	if (uap->sigev == NULL) {
 		evp = NULL;
 	} else {
 		error = copyin(uap->sigev, &ev, sizeof(ev));
 		if (error != 0)
 			return (error);
 		evp = &ev;
 	}
 	return (kern_kmq_notify(td, uap->mqd, evp));
 }
 
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct mqueue *mq;
 #ifdef INVARIANTS
 	struct filedesc *fdp;
  
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK_ASSERT(fdp);
 #endif
 
 	if (fp->f_ops == &mqueueops) {
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(td->td_proc, mq, fd);
 
 		/* have to wakeup thread in same process */
 		if (mq->mq_flags & MQ_RSEL) {
 			mq->mq_flags &= ~MQ_RSEL;
 			selwakeup(&mq->mq_rsel);
 		}
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		mtx_unlock(&mq->mq_mutex);
 	}
 }
 
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int i;
 
 	fdp = p->p_fd;
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < fdp->fd_nfiles; ++i) {
 		fp = fget_locked(fdp, i);
 		if (fp != NULL && fp->f_ops == &mqueueops) {
 			mq = FPTOMQ(fp);
 			mtx_lock(&mq->mq_mutex);
 			notifier_remove(p, FPTOMQ(fp), i);
 			mtx_unlock(&mq->mq_mutex);
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int revents = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (mq->mq_curmsgs) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
  		}
 	}
 	if (events & POLLOUT) {
 		if (mq->mq_curmsgs < mq->mq_maxmsg)
 			revents |= POLLOUT;
 		else {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (revents);
 }
 
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *pn;
 
 	fp->f_ops = &badfileops;
 	pn = fp->f_data;
 	fp->f_data = NULL;
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqfs_node *pn = fp->f_data;
 
 	bzero(st, sizeof *st);
 	sx_xlock(&mqfs_data.mi_lock);
 	st->st_atim = pn->mn_atime;
 	st->st_mtim = pn->mn_mtime;
 	st->st_ctim = pn->mn_ctime;
 	st->st_birthtim = pn->mn_birth;
 	st->st_uid = pn->mn_uid;
 	st->st_gid = pn->mn_gid;
 	st->st_mode = S_IFIFO | pn->mn_mode;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 static int
 mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = 0;
 	pn = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
 	    active_cred, NULL);
 	if (error != 0)
 		goto out;
 	pn->mn_mode = mode & ACCESSPERMS;
 out:
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 static int
 mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = 0;
 	pn = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	if (uid == (uid_t)-1)
 		uid = pn->mn_uid;
 	if (gid == (gid_t)-1)
 		gid = pn->mn_gid;
 	if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
 	    (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	pn->mn_uid = uid;
 	pn->mn_gid = gid;
 out:
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int error = 0;
 
 	if (kn->kn_filter == EVFILT_READ) {
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 	} else
 		error = EINVAL;
 	return (error);
 }
 
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	if (kn->kn_filter == EVFILT_READ)
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 	else if (kn->kn_filter == EVFILT_WRITE)
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 	else
 		panic("filt_mqdetach");
 }
 
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 static int
 mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_MQUEUE;
 	return (0);
 }
 
 static struct fileops mqueueops = {
 	.fo_read		= invfo_rdwr,
 	.fo_write		= invfo_rdwr,
 	.fo_truncate		= invfo_truncate,
 	.fo_ioctl		= invfo_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
 	.fo_sendfile		= invfo_sendfile,
 	.fo_fill_kinfo		= mqf_fill_kinfo,
 };
 
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
+VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops);
 
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 static struct vfsconf mqueuefs_vfsconf = {
 	.vfc_version = VFS_VERSION,
 	.vfc_name = "mqueuefs",
 	.vfc_vfsops = &mqfs_vfsops,
 	.vfc_typenum = -1,
 	.vfc_flags = VFCF_SYNTHETIC
 };
 
 static struct syscall_helper_data mq_syscalls[] = {
 	SYSCALL_INIT_HELPER(kmq_open),
 	SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static void
 mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 static void
 mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 int
 freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	struct mq_attr32 attr32;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
 	    uap->attr != NULL ? &attr : NULL));
 }
 
 int
 freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	struct mq_attr32 attr32, oattr32;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error != 0)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		mq_attr_to32(&oattr, &oattr32);
 		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
 	}
 	return (error);
 }
 
 int
 freebsd32_kmq_timedsend(struct thread *td,
     struct freebsd32_kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 freebsd32_kmq_timedreceive(struct thread *td,
     struct freebsd32_kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
 {
 	struct sigevent ev, *evp;
 	struct sigevent32 ev32;
 	int error;
 
 	if (uap->sigev == NULL) {
 		evp = NULL;
 	} else {
 		error = copyin(uap->sigev, &ev32, sizeof(ev32));
 		if (error != 0)
 			return (error);
 		error = convert_sigevent32(&ev32, &ev);
 		if (error != 0)
 			return (error);
 		evp = &ev;
 	}
 	return (kern_kmq_notify(td, uap->mqd, evp));
 }
 
 static struct syscall_helper_data mq32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 static int
 mqinit(void)
 {
 	int error;
 
 	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #endif
 	return (0);
 }
 
 static int
 mqunload(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(mq32_syscalls);
 #endif
 	syscall_helper_unregister(mq_syscalls);
 	return (0);
 }
 
 static int
 mq_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	error = vfs_modevent(module, cmd, arg);
 	if (error != 0)
 		return (error);
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = mqinit();
 		if (error != 0)
 			mqunload();
 		break;
 	case MOD_UNLOAD:
 		error = mqunload();
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t mqueuefs_mod = {
 	"mqueuefs",
 	mq_modload,
 	&mqueuefs_vfsconf
 };
 DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
 MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/vfs_default.c
===================================================================
--- head/sys/kern/vfs_default.c	(revision 355789)
+++ head/sys/kern/vfs_default.c	(revision 355790)
@@ -1,1472 +1,1473 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed
  * to Berkeley by John Heidemann of the UCLA Ficus project.
  *
  * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/rwlock.h>
 #include <sys/fcntl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/poll.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 static int	vop_nolookup(struct vop_lookup_args *);
 static int	vop_norename(struct vop_rename_args *);
 static int	vop_nostrategy(struct vop_strategy_args *);
 static int	get_next_dirent(struct vnode *vp, struct dirent **dpp,
 				char *dirbuf, int dirbuflen, off_t *off,
 				char **cpos, int *len, int *eofflag,
 				struct thread *td);
 static int	dirent_exists(struct vnode *vp, const char *dirname,
 			      struct thread *td);
 
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
 static int vop_stdis_text(struct vop_is_text_args *ap);
 static int vop_stdunset_text(struct vop_unset_text_args *ap);
 static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
 static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
 static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
 static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
 
 /*
  * This vnode table stores what we want to do if the filesystem doesn't
  * implement a particular VOP.
  *
  * If there is no specific entry here, we will return EOPNOTSUPP.
  *
  * Note that every filesystem has to implement either vop_access
  * or vop_accessx; failing to do so will result in immediate crash
  * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(),
  * which calls vop_stdaccess() etc.
  */
 
 struct vop_vector default_vnodeops = {
 	.vop_default =		NULL,
 	.vop_bypass =		VOP_EOPNOTSUPP,
 
 	.vop_access =		vop_stdaccess,
 	.vop_accessx =		vop_stdaccessx,
 	.vop_advise =		vop_stdadvise,
 	.vop_advlock =		vop_stdadvlock,
 	.vop_advlockasync =	vop_stdadvlockasync,
 	.vop_advlockpurge =	vop_stdadvlockpurge,
 	.vop_allocate =		vop_stdallocate,
 	.vop_bmap =		vop_stdbmap,
 	.vop_close =		VOP_NULL,
 	.vop_fsync =		VOP_NULL,
 	.vop_fdatasync =	vop_stdfdatasync,
 	.vop_getpages =		vop_stdgetpages,
 	.vop_getpages_async =	vop_stdgetpages_async,
 	.vop_getwritemount = 	vop_stdgetwritemount,
 	.vop_inactive =		VOP_NULL,
 	.vop_need_inactive =	vop_stdneed_inactive,
 	.vop_ioctl =		vop_stdioctl,
 	.vop_kqfilter =		vop_stdkqfilter,
 	.vop_islocked =		vop_stdislocked,
 	.vop_lock1 =		vop_stdlock,
 	.vop_lookup =		vop_nolookup,
 	.vop_open =		VOP_NULL,
 	.vop_pathconf =		VOP_EINVAL,
 	.vop_poll =		vop_nopoll,
 	.vop_putpages =		vop_stdputpages,
 	.vop_readlink =		VOP_EINVAL,
 	.vop_rename =		vop_norename,
 	.vop_revoke =		VOP_PANIC,
 	.vop_strategy =		vop_nostrategy,
 	.vop_unlock =		vop_stdunlock,
 	.vop_vptocnp =		vop_stdvptocnp,
 	.vop_vptofh =		vop_stdvptofh,
 	.vop_unp_bind =		vop_stdunp_bind,
 	.vop_unp_connect =	vop_stdunp_connect,
 	.vop_unp_detach =	vop_stdunp_detach,
 	.vop_is_text =		vop_stdis_text,
 	.vop_set_text =		vop_stdset_text,
 	.vop_unset_text =	vop_stdunset_text,
 	.vop_add_writecount =	vop_stdadd_writecount,
 	.vop_copy_file_range =	vop_stdcopy_file_range,
 };
+VFS_VOP_VECTOR_REGISTER(default_vnodeops);
 
 /*
  * Series of placeholder functions for various error returns for
  * VOPs.
  */
 
 int
 vop_eopnotsupp(struct vop_generic_args *ap)
 {
 	/*
 	printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name);
 	*/
 
 	return (EOPNOTSUPP);
 }
 
 int
 vop_ebadf(struct vop_generic_args *ap)
 {
 
 	return (EBADF);
 }
 
 int
 vop_enotty(struct vop_generic_args *ap)
 {
 
 	return (ENOTTY);
 }
 
 int
 vop_einval(struct vop_generic_args *ap)
 {
 
 	return (EINVAL);
 }
 
 int
 vop_enoent(struct vop_generic_args *ap)
 {
 
 	return (ENOENT);
 }
 
 int
 vop_null(struct vop_generic_args *ap)
 {
 
 	return (0);
 }
 
 /*
  * Helper function to panic on some bad VOPs in some filesystems.
  */
 int
 vop_panic(struct vop_generic_args *ap)
 {
 
 	panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name);
 }
 
 /*
  * vop_std<something> and vop_no<something> are default functions for use by
  * filesystems that need the "default reasonable" implementation for a
  * particular operation.
  *
  * The documentation for the operations they implement exists (if it exists)
  * in the VOP_<SOMETHING>(9) manpage (all uppercase).
  */
 
 /*
  * Default vop for filesystems that do not support name lookup
  */
 static int
 vop_nolookup(ap)
 	struct vop_lookup_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 
 	*ap->a_vpp = NULL;
 	return (ENOTDIR);
 }
 
 /*
  * vop_norename:
  *
  * Handle unlock and reference counting for arguments of vop_rename
  * for filesystems that do not implement rename operation.
  */
 static int
 vop_norename(struct vop_rename_args *ap)
 {
 
 	vop_rename_fail(ap);
 	return (EOPNOTSUPP);
 }
 
 /*
  *	vop_nostrategy:
  *
  *	Strategy routine for VFS devices that have none.
  *
  *	BIO_ERROR and B_INVAL must be cleared prior to calling any strategy
  *	routine.  Typically this is done for a BIO_READ strategy call.
  *	Typically B_INVAL is assumed to already be clear prior to a write
  *	and should not be cleared manually unless you just made the buffer
  *	invalid.  BIO_ERROR should be cleared either way.
  */
 
 static int
 vop_nostrategy (struct vop_strategy_args *ap)
 {
 	printf("No strategy for buffer at %p\n", ap->a_bp);
 	vn_printf(ap->a_vp, "vnode ");
 	ap->a_bp->b_ioflags |= BIO_ERROR;
 	ap->a_bp->b_error = EOPNOTSUPP;
 	bufdone(ap->a_bp);
 	return (EOPNOTSUPP);
 }
 
 static int
 get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf,
 		int dirbuflen, off_t *off, char **cpos, int *len,
 		int *eofflag, struct thread *td)
 {
 	int error, reclen;
 	struct uio uio;
 	struct iovec iov;
 	struct dirent *dp;
 
 	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
 	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
 
 	if (*len == 0) {
 		iov.iov_base = dirbuf;
 		iov.iov_len = dirbuflen;
 
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = *off;
 		uio.uio_resid = dirbuflen;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_READ;
 		uio.uio_td = td;
 
 		*eofflag = 0;
 
 #ifdef MAC
 		error = mac_vnode_check_readdir(td->td_ucred, vp);
 		if (error == 0)
 #endif
 			error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag,
 		    		NULL, NULL);
 		if (error)
 			return (error);
 
 		*off = uio.uio_offset;
 
 		*cpos = dirbuf;
 		*len = (dirbuflen - uio.uio_resid);
 
 		if (*len == 0)
 			return (ENOENT);
 	}
 
 	dp = (struct dirent *)(*cpos);
 	reclen = dp->d_reclen;
 	*dpp = dp;
 
 	/* check for malformed directory.. */
 	if (reclen < DIRENT_MINSIZE)
 		return (EINVAL);
 
 	*cpos += reclen;
 	*len -= reclen;
 
 	return (0);
 }
 
 /*
  * Check if a named file exists in a given directory vnode.
  */
 static int
 dirent_exists(struct vnode *vp, const char *dirname, struct thread *td)
 {
 	char *dirbuf, *cpos;
 	int error, eofflag, dirbuflen, len, found;
 	off_t off;
 	struct dirent *dp;
 	struct vattr va;
 
 	KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp));
 	KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp));
 
 	found = 0;
 
 	error = VOP_GETATTR(vp, &va, td->td_ucred);
 	if (error)
 		return (found);
 
 	dirbuflen = DEV_BSIZE;
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
 	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	off = 0;
 	len = 0;
 	do {
 		error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off,
 					&cpos, &len, &eofflag, td);
 		if (error)
 			goto out;
 
 		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
 		    strcmp(dp->d_name, dirname) == 0) {
 			found = 1;
 			goto out;
 		}
 	} while (len > 0 || !eofflag);
 
 out:
 	free(dirbuf, M_TEMP);
 	return (found);
 }
 
 int
 vop_stdaccess(struct vop_access_args *ap)
 {
 
 	KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
 	    VAPPEND)) == 0, ("invalid bit in accmode"));
 
 	return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td));
 }
 
 int
 vop_stdaccessx(struct vop_accessx_args *ap)
 {
 	int error;
 	accmode_t accmode = ap->a_accmode;
 
 	error = vfs_unixify_accmode(&accmode);
 	if (error != 0)
 		return (error);
 
 	if (accmode == 0)
 		return (0);
 
 	return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
 }
 
 /*
  * Advisory record locking support
  */
 int
 vop_stdadvlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 
 	vp = ap->a_vp;
 	if (ap->a_fl->l_whence == SEEK_END) {
 		/*
 		 * The NFSv4 server must avoid doing a vn_lock() here, since it
 		 * can deadlock the nfsd threads, due to a LOR.  Fortunately
 		 * the NFSv4 server always uses SEEK_SET and this code is
 		 * only required for the SEEK_END case.
 		 */
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
 		VOP_UNLOCK(vp, 0);
 		if (error)
 			return (error);
 	} else
 		vattr.va_size = 0;
 
 	return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
 }
 
 int
 vop_stdadvlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	int error;
 
 	vp = ap->a_vp;
 	if (ap->a_fl->l_whence == SEEK_END) {
 		/* The size argument is only needed for SEEK_END. */
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
 		VOP_UNLOCK(vp, 0);
 		if (error)
 			return (error);
 	} else
 		vattr.va_size = 0;
 
 	return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
 }
 
 int
 vop_stdadvlockpurge(struct vop_advlockpurge_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	lf_purgelocks(vp, &vp->v_lockf);
 	return (0);
 }
 
 /*
  * vop_stdpathconf:
  *
  * Standard implementation of POSIX pathconf, to get information about limits
  * for a filesystem.
  * Override per filesystem for the case where the filesystem has smaller
  * limits.
  */
 int
 vop_stdpathconf(ap)
 	struct vop_pathconf_args /* {
 	struct vnode *a_vp;
 	int a_name;
 	int *a_retval;
 	} */ *ap;
 {
 
 	switch (ap->a_name) {
 		case _PC_ASYNC_IO:
 			*ap->a_retval = _POSIX_ASYNCHRONOUS_IO;
 			return (0);
 		case _PC_PATH_MAX:
 			*ap->a_retval = PATH_MAX;
 			return (0);
 		case _PC_ACL_EXTENDED:
 		case _PC_ACL_NFS4:
 		case _PC_CAP_PRESENT:
 		case _PC_INF_PRESENT:
 		case _PC_MAC_PRESENT:
 			*ap->a_retval = 0;
 			return (0);
 		default:
 			return (EINVAL);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Standard lock, unlock and islocked functions.
  */
 int
 vop_stdlock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		char *file;
 		int line;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct mtx *ilk;
 
 	ilk = VI_MTX(vp);
 	return (lockmgr_lock_fast_path(vp->v_vnlock, ap->a_flags,
 	    &ilk->lock_object, ap->a_file, ap->a_line));
 }
 
 /* See above. */
 int
 vop_stdunlock(ap)
 	struct vop_unlock_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct mtx *ilk;
 
 	ilk = VI_MTX(vp);
 	return (lockmgr_unlock_fast_path(vp->v_vnlock, ap->a_flags,
 	    &ilk->lock_object));
 }
 
 /* See above. */
 int
 vop_stdislocked(ap)
 	struct vop_islocked_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 
 	return (lockstatus(ap->a_vp->v_vnlock));
 }
 
 /*
  * Variants of the above set.
  *
  * Differences are:
  * - shared locking disablement is not supported
  * - v_vnlock pointer is not honored
  */
 int
 vop_lock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		char *file;
 		int line;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int flags = ap->a_flags;
 	struct mtx *ilk;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 
 	if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0))
 		goto other;
 
 	switch (flags & LK_TYPE_MASK) {
 	case LK_SHARED:
 		return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line));
 	case LK_EXCLUSIVE:
 		return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line));
 	}
 other:
 	ilk = VI_MTX(vp);
 	return (lockmgr_lock_fast_path(&vp->v_lock, flags,
 	    &ilk->lock_object, ap->a_file, ap->a_line));
 }
 
 int
 vop_unlock(ap)
 	struct vop_unlock_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 	MPASS(ap->a_flags == 0);
 
 	return (lockmgr_unlock(&vp->v_lock));
 }
 
 int
 vop_islocked(ap)
 	struct vop_islocked_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
 	MPASS(vp->v_vnlock == &vp->v_lock);
 
 	return (lockstatus(&vp->v_lock));
 }
 
 /*
  * Return true for select/poll.
  */
 int
 vop_nopoll(ap)
 	struct vop_poll_args /* {
 		struct vnode *a_vp;
 		int  a_events;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (poll_no_poll(ap->a_events));
 }
 
 /*
  * Implement poll for local filesystems that support it.
  */
 int
 vop_stdpoll(ap)
 	struct vop_poll_args /* {
 		struct vnode *a_vp;
 		int  a_events;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	if (ap->a_events & ~POLLSTANDARD)
 		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 	return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 }
 
 /*
  * Return our mount point, as we will take charge of the writes.
  */
 int
 vop_stdgetwritemount(ap)
 	struct vop_getwritemount_args /* {
 		struct vnode *a_vp;
 		struct mount **a_mpp;
 	} */ *ap;
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * Note that having a reference does not prevent forced unmount from
 	 * setting ->v_mount to NULL after the lock gets released. This is of
 	 * no consequence for typical consumers (most notably vn_start_write)
 	 * since in this case the vnode is VIRF_DOOMED. Unmount might have
 	 * progressed far enough that its completion is only delayed by the
 	 * reference obtained here. The consumer only needs to concern itself
 	 * with releasing it.
 	 */
 	vp = ap->a_vp;
 	mp = vp->v_mount;
 	if (mp == NULL) {
 		*(ap->a_mpp) = NULL;
 		return (0);
 	}
 	if (vfs_op_thread_enter(mp)) {
 		if (mp == vp->v_mount) {
 			vfs_mp_count_add_pcpu(mp, ref, 1);
 			vfs_op_thread_exit(mp);
 		} else {
 			vfs_op_thread_exit(mp);
 			mp = NULL;
 		}
 	} else {
 		MNT_ILOCK(mp);
 		if (mp == vp->v_mount) {
 			MNT_REF(mp);
 			MNT_IUNLOCK(mp);
 		} else {
 			MNT_IUNLOCK(mp);
 			mp = NULL;
 		}
 	}
 	*(ap->a_mpp) = mp;
 	return (0);
 }
 
 /*
  * If the file system doesn't implement VOP_BMAP, then return sensible defaults:
  * - Return the vnode's bufobj instead of any underlying device's bufobj
  * - Calculate the physical block number as if there were equal size
  *   consecutive blocks, but
  * - Report no contiguous runs of blocks.
  */
 int
 vop_stdbmap(ap)
 	struct vop_bmap_args /* {
 		struct vnode *a_vp;
 		daddr_t  a_bn;
 		struct bufobj **a_bop;
 		daddr_t *a_bnp;
 		int *a_runp;
 		int *a_runb;
 	} */ *ap;
 {
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize);
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 	return (0);
 }
 
 int
 vop_stdfsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
 
 	return (vn_fsync_buf(ap->a_vp, ap->a_waitfor));
 }
 
 static int
 vop_stdfdatasync(struct vop_fdatasync_args *ap)
 {
 
 	return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td));
 }
 
 int
 vop_stdfdatasync_buf(struct vop_fdatasync_args *ap)
 {
 
 	return (vn_fsync_buf(ap->a_vp, MNT_WAIT));
 }
 
 /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */
 int
 vop_stdgetpages(ap)
 	struct vop_getpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int *a_rbehind;
 		int *a_rahead;
 	} */ *ap;
 {
 
 	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 	    ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL);
 }
 
 static int
 vop_stdgetpages_async(struct vop_getpages_async_args *ap)
 {
 	int error;
 
 	error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead);
 	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
 	return (error);
 }
 
 int
 vop_stdkqfilter(struct vop_kqfilter_args *ap)
 {
 	return vfs_kqfilter(ap);
 }
 
 /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */
 int
 vop_stdputpages(ap)
 	struct vop_putpages_args /* {
 		struct vnode *a_vp;
 		vm_page_t *a_m;
 		int a_count;
 		int a_sync;
 		int *a_rtvals;
 	} */ *ap;
 {
 
 	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
 	     ap->a_sync, ap->a_rtvals);
 }
 
 int
 vop_stdvptofh(struct vop_vptofh_args *ap)
 {
 	return (EOPNOTSUPP);
 }
 
 int
 vop_stdvptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct ucred *cred = ap->a_cred;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	char *dirbuf, *cpos;
 	int i, error, eofflag, dirbuflen, flags, locked, len, covered;
 	off_t off;
 	ino_t fileno;
 	struct vattr va;
 	struct nameidata nd;
 	struct thread *td;
 	struct dirent *dp;
 	struct vnode *mvp;
 
 	i = *buflen;
 	error = 0;
 	covered = 0;
 	td = curthread;
 
 	if (vp->v_type != VDIR)
 		return (ENOENT);
 
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error)
 		return (error);
 
 	VREF(vp);
 	locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
 	    "..", vp, td);
 	flags = FREAD;
 	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL);
 	if (error) {
 		vn_lock(vp, locked | LK_RETRY);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	mvp = *dvp = nd.ni_vp;
 
 	if (vp->v_mount != (*dvp)->v_mount &&
 	    ((*dvp)->v_vflag & VV_ROOT) &&
 	    ((*dvp)->v_mount->mnt_flag & MNT_UNION)) {
 		*dvp = (*dvp)->v_mount->mnt_vnodecovered;
 		VREF(mvp);
 		VOP_UNLOCK(mvp, 0);
 		vn_close(mvp, FREAD, cred, td);
 		VREF(*dvp);
 		vn_lock(*dvp, LK_SHARED | LK_RETRY);
 		covered = 1;
 	}
 
 	fileno = va.va_fileid;
 
 	dirbuflen = DEV_BSIZE;
 	if (dirbuflen < va.va_blocksize)
 		dirbuflen = va.va_blocksize;
 	dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK);
 
 	if ((*dvp)->v_type != VDIR) {
 		error = ENOENT;
 		goto out;
 	}
 
 	off = 0;
 	len = 0;
 	do {
 		/* call VOP_READDIR of parent */
 		error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off,
 					&cpos, &len, &eofflag, td);
 		if (error)
 			goto out;
 
 		if ((dp->d_type != DT_WHT) &&
 		    (dp->d_fileno == fileno)) {
 			if (covered) {
 				VOP_UNLOCK(*dvp, 0);
 				vn_lock(mvp, LK_SHARED | LK_RETRY);
 				if (dirent_exists(mvp, dp->d_name, td)) {
 					error = ENOENT;
 					VOP_UNLOCK(mvp, 0);
 					vn_lock(*dvp, LK_SHARED | LK_RETRY);
 					goto out;
 				}
 				VOP_UNLOCK(mvp, 0);
 				vn_lock(*dvp, LK_SHARED | LK_RETRY);
 			}
 			i -= dp->d_namlen;
 
 			if (i < 0) {
 				error = ENOMEM;
 				goto out;
 			}
 			if (dp->d_namlen == 1 && dp->d_name[0] == '.') {
 				error = ENOENT;
 			} else {
 				bcopy(dp->d_name, buf + i, dp->d_namlen);
 				error = 0;
 			}
 			goto out;
 		}
 	} while (len > 0 || !eofflag);
 	error = ENOENT;
 
 out:
 	free(dirbuf, M_TEMP);
 	if (!error) {
 		*buflen = i;
 		vref(*dvp);
 	}
 	if (covered) {
 		vput(*dvp);
 		vrele(mvp);
 	} else {
 		VOP_UNLOCK(mvp, 0);
 		vn_close(mvp, FREAD, cred, td);
 	}
 	vn_lock(vp, locked | LK_RETRY);
 	return (error);
 }
 
 int
 vop_stdallocate(struct vop_allocate_args *ap)
 {
 #ifdef __notyet__
 	struct statfs *sfs;
 	off_t maxfilesize = 0;
 #endif
 	struct iovec aiov;
 	struct vattr vattr, *vap;
 	struct uio auio;
 	off_t fsize, len, cur, offset;
 	uint8_t *buf;
 	struct thread *td;
 	struct vnode *vp;
 	size_t iosize;
 	int error;
 
 	buf = NULL;
 	error = 0;
 	td = curthread;
 	vap = &vattr;
 	vp = ap->a_vp;
 	len = *ap->a_len;
 	offset = *ap->a_offset;
 
 	error = VOP_GETATTR(vp, vap, td->td_ucred);
 	if (error != 0)
 		goto out;
 	fsize = vap->va_size;
 	iosize = vap->va_blocksize;
 	if (iosize == 0)
 		iosize = BLKDEV_IOSIZE;
 	if (iosize > MAXPHYS)
 		iosize = MAXPHYS;
 	buf = malloc(iosize, M_TEMP, M_WAITOK);
 
 #ifdef __notyet__
 	/*
 	 * Check if the filesystem sets f_maxfilesize; if not use
 	 * VOP_SETATTR to perform the check.
 	 */
 	sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = VFS_STATFS(vp->v_mount, sfs, td);
 	if (error == 0)
 		maxfilesize = sfs->f_maxfilesize;
 	free(sfs, M_STATFS);
 	if (error != 0)
 		goto out;
 	if (maxfilesize) {
 		if (offset > maxfilesize || len > maxfilesize ||
 		    offset + len > maxfilesize) {
 			error = EFBIG;
 			goto out;
 		}
 	} else
 #endif
 	if (offset + len > vap->va_size) {
 		/*
 		 * Test offset + len against the filesystem's maxfilesize.
 		 */
 		VATTR_NULL(vap);
 		vap->va_size = offset + len;
 		error = VOP_SETATTR(vp, vap, td->td_ucred);
 		if (error != 0)
 			goto out;
 		VATTR_NULL(vap);
 		vap->va_size = fsize;
 		error = VOP_SETATTR(vp, vap, td->td_ucred);
 		if (error != 0)
 			goto out;
 	}
 
 	for (;;) {
 		/*
 		 * Read and write back anything below the nominal file
 		 * size.  There's currently no way outside the filesystem
 		 * to know whether this area is sparse or not.
 		 */
 		cur = iosize;
 		if ((offset % iosize) != 0)
 			cur -= (offset % iosize);
 		if (cur > len)
 			cur = len;
 		if (offset < fsize) {
 			aiov.iov_base = buf;
 			aiov.iov_len = cur;
 			auio.uio_iov = &aiov;
 			auio.uio_iovcnt = 1;
 			auio.uio_offset = offset;
 			auio.uio_resid = cur;
 			auio.uio_segflg = UIO_SYSSPACE;
 			auio.uio_rw = UIO_READ;
 			auio.uio_td = td;
 			error = VOP_READ(vp, &auio, 0, td->td_ucred);
 			if (error != 0)
 				break;
 			if (auio.uio_resid > 0) {
 				bzero(buf + cur - auio.uio_resid,
 				    auio.uio_resid);
 			}
 		} else {
 			bzero(buf, cur);
 		}
 
 		aiov.iov_base = buf;
 		aiov.iov_len = cur;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = offset;
 		auio.uio_resid = cur;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_rw = UIO_WRITE;
 		auio.uio_td = td;
 
 		error = VOP_WRITE(vp, &auio, 0, td->td_ucred);
 		if (error != 0)
 			break;
 
 		len -= cur;
 		offset += cur;
 		if (len == 0)
 			break;
 		if (should_yield())
 			break;
 	}
 
  out:
 	*ap->a_len = len;
 	*ap->a_offset = offset;
 	free(buf, M_TEMP);
 	return (error);
 }
 
 int
 vop_stdadvise(struct vop_advise_args *ap)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	daddr_t startn, endn;
 	off_t bstart, bend, start, end;
 	int bsize, error;
 
 	vp = ap->a_vp;
 	switch (ap->a_advice) {
 	case POSIX_FADV_WILLNEED:
 		/*
 		 * Do nothing for now.  Filesystems should provide a
 		 * custom method which starts an asynchronous read of
 		 * the requested region.
 		 */
 		error = 0;
 		break;
 	case POSIX_FADV_DONTNEED:
 		error = 0;
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (VN_IS_DOOMED(vp)) {
 			VOP_UNLOCK(vp, 0);
 			break;
 		}
 
 		/*
 		 * Round to block boundaries (and later possibly further to
 		 * page boundaries).  Applications cannot reasonably be aware  
 		 * of the boundaries, and the rounding must be to expand at
 		 * both extremities to cover enough.  It still doesn't cover
 		 * read-ahead.  For partial blocks, this gives unnecessary
 		 * discarding of buffers but is efficient enough since the
 		 * pages usually remain in VMIO for some time.
 		 */
 		bsize = vp->v_bufobj.bo_bsize;
 		bstart = rounddown(ap->a_start, bsize);
 		bend = roundup(ap->a_end, bsize);
 
 		/*
 		 * Deactivate pages in the specified range from the backing VM
 		 * object.  Pages that are resident in the buffer cache will
 		 * remain wired until their corresponding buffers are released
 		 * below.
 		 */
 		if (vp->v_object != NULL) {
 			start = trunc_page(bstart);
 			end = round_page(bend);
 			VM_OBJECT_RLOCK(vp->v_object);
 			vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start),
 			    OFF_TO_IDX(end));
 			VM_OBJECT_RUNLOCK(vp->v_object);
 		}
 
 		bo = &vp->v_bufobj;
 		BO_RLOCK(bo);
 		startn = bstart / bsize;
 		endn = bend / bsize;
 		error = bnoreuselist(&bo->bo_clean, bo, startn, endn);
 		if (error == 0)
 			error = bnoreuselist(&bo->bo_dirty, bo, startn, endn);
 		BO_RUNLOCK(bo);
 		VOP_UNLOCK(vp, 0);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 int
 vop_stdunp_bind(struct vop_unp_bind_args *ap)
 {
 
 	ap->a_vp->v_unpcb = ap->a_unpcb;
 	return (0);
 }
 
 int
 vop_stdunp_connect(struct vop_unp_connect_args *ap)
 {
 
 	*ap->a_unpcb = ap->a_vp->v_unpcb;
 	return (0);
 }
 
 int
 vop_stdunp_detach(struct vop_unp_detach_args *ap)
 {
 
 	ap->a_vp->v_unpcb = NULL;
 	return (0);
 }
 
 static int
 vop_stdis_text(struct vop_is_text_args *ap)
 {
 
 	return (ap->a_vp->v_writecount < 0);
 }
 
 int
 vop_stdset_text(struct vop_set_text_args *ap)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	int error;
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	if (vp->v_writecount > 0) {
 		error = ETXTBSY;
 	} else {
 		/*
 		 * If requested by fs, keep a use reference to the
 		 * vnode until the last text reference is released.
 		 */
 		mp = vp->v_mount;
 		if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 &&
 		    vp->v_writecount == 0) {
 			vp->v_iflag |= VI_TEXT_REF;
 			vrefl(vp);
 		}
 
 		vp->v_writecount--;
 		error = 0;
 	}
 	VI_UNLOCK(vp);
 	return (error);
 }
 
 static int
 vop_stdunset_text(struct vop_unset_text_args *ap)
 {
 	struct vnode *vp;
 	int error;
 	bool last;
 
 	vp = ap->a_vp;
 	last = false;
 	VI_LOCK(vp);
 	if (vp->v_writecount < 0) {
 		if ((vp->v_iflag & VI_TEXT_REF) != 0 &&
 		    vp->v_writecount == -1) {
 			last = true;
 			vp->v_iflag &= ~VI_TEXT_REF;
 		}
 		vp->v_writecount++;
 		error = 0;
 	} else {
 		error = EINVAL;
 	}
 	VI_UNLOCK(vp);
 	if (last)
 		vunref(vp);
 	return (error);
 }
 
 static int
 vop_stdadd_writecount(struct vop_add_writecount_args *ap)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = ap->a_vp;
 	VI_LOCK_FLAGS(vp, MTX_DUPOK);
 	if (vp->v_writecount < 0) {
 		error = ETXTBSY;
 	} else {
 		VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 		    ("neg writecount increment %d", ap->a_inc));
 		vp->v_writecount += ap->a_inc;
 		error = 0;
 	}
 	VI_UNLOCK(vp);
 	return (error);
 }
 
 int
 vop_stdneed_inactive(struct vop_need_inactive_args *ap)
 {
 
 	return (1);
 }
 
 int
 vop_stdioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct vattr va;
 	off_t *offp;
 	int error;
 
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
 	case FIOSEEKHOLE:
 		vp = ap->a_vp;
 		error = vn_lock(vp, LK_SHARED);
 		if (error != 0)
 			return (EBADF);
 		if (vp->v_type == VREG)
 			error = VOP_GETATTR(vp, &va, ap->a_cred);
 		else
 			error = ENOTTY;
 		if (error == 0) {
 			offp = ap->a_data;
 			if (*offp < 0 || *offp >= va.va_size)
 				error = ENXIO;
 			else if (ap->a_command == FIOSEEKHOLE)
 				*offp = va.va_size;
 		}
 		VOP_UNLOCK(vp, 0);
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	return (error);
 }
 
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
  */
 int
 vfs_stdroot (mp, flags, vpp)
 	struct mount *mp;
 	int flags;
 	struct vnode **vpp;
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdstatfs (mp, sbp)
 	struct mount *mp;
 	struct statfs *sbp;
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdquotactl (mp, cmds, uid, arg)
 	struct mount *mp;
 	int cmds;
 	uid_t uid;
 	void *arg;
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdsync(mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 	struct vnode *vp, *mvp;
 	struct thread *td;
 	int error, lockreq, allerror = 0;
 
 	td = curthread;
 	lockreq = LK_EXCLUSIVE | LK_INTERLOCK;
 	if (waitfor != MNT_WAIT)
 		lockreq |= LK_NOWAIT;
 	/*
 	 * Force stale buffer cache information to be flushed.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		if (vp->v_bufobj.bo_dirty.bv_cnt == 0) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		if ((error = vget(vp, lockreq, td)) != 0) {
 			if (error == ENOENT) {
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				goto loop;
 			}
 			continue;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		if (error)
 			allerror = error;
 		vput(vp);
 	}
 	return (allerror);
 }
 
 int
 vfs_stdnosync (mp, waitfor)
 	struct mount *mp;
 	int waitfor;
 {
 
 	return (0);
 }
 
 static int
 vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
 {
 	int error;
 
 	error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
 	    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
 	    ap->a_outcred, ap->a_fsizetd);
 	return (error);
 }
 
 int
 vfs_stdvget (mp, ino, flags, vpp)
 	struct mount *mp;
 	ino_t ino;
 	int flags;
 	struct vnode **vpp;
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdfhtovp (mp, fhp, flags, vpp)
 	struct mount *mp;
 	struct fid *fhp;
 	int flags;
 	struct vnode **vpp;
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdinit (vfsp)
 	struct vfsconf *vfsp;
 {
 
 	return (0);
 }
 
 int
 vfs_stduninit (vfsp)
 	struct vfsconf *vfsp;
 {
 
 	return(0);
 }
 
 int
 vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)
 	struct mount *mp;
 	int cmd;
 	struct vnode *filename_vp;
 	int attrnamespace;
 	const char *attrname;
 {
 
 	if (filename_vp != NULL)
 		VOP_UNLOCK(filename_vp, 0);
 	return (EOPNOTSUPP);
 }
 
 int
 vfs_stdsysctl(mp, op, req)
 	struct mount *mp;
 	fsctlop_t op;
 	struct sysctl_req *req;
 {
 
 	return (EOPNOTSUPP);
 }
 
 static vop_bypass_t *
 bp_by_off(struct vop_vector *vop, struct vop_generic_args *a)
 {
 
 	return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset));
 }
 
 int
 vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a)
 {
 	vop_bypass_t *bp;
 	int prev_stops, rc;
 
 	for (; vop != NULL; vop = vop->vop_default) {
 		bp = bp_by_off(vop, a);
 		if (bp != NULL)
 			break;
 
 		/*
 		 * Bypass is not really supported.  It is done for
 		 * fallback to unimplemented vops in the default
 		 * vector.
 		 */
 		bp = vop->vop_bypass;
 		if (bp != NULL)
 			break;
 	}
 	MPASS(bp != NULL);
 
 	prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT);
 	rc = bp(a);
 	sigallowstop(prev_stops);
 	return (rc);
 }
Index: head/sys/kern/vfs_lookup.c
===================================================================
--- head/sys/kern/vfs_lookup.c	(revision 355789)
+++ head/sys/kern/vfs_lookup.c	(revision 355790)
@@ -1,1480 +1,1485 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #define	NAMEI_DIAGNOSTIC 1
 #undef NAMEI_DIAGNOSTIC
 
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
     "unsigned long");
 SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
 
 /* Allocation zone for namei. */
 uma_zone_t namei_zone;
 
 /* Placeholder vnode for mp traversal. */
 static struct vnode *vp_crossmp;
 
 static int
 crossmp_vop_islocked(struct vop_islocked_args *ap)
 {
 
 	return (LK_SHARED);
 }
 
 static int
 crossmp_vop_lock1(struct vop_lock1_args *ap)
 {
 	struct vnode *vp;
 	struct lock *lk __unused;
 	const char *file __unused;
 	int flags, line __unused;
 
 	vp = ap->a_vp;
 	lk = vp->v_vnlock;
 	flags = ap->a_flags;
 	file = ap->a_file;
 	line = ap->a_line;
 
 	if ((flags & LK_SHARED) == 0)
 		panic("invalid lock request for crossmp");
 
 	WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line,
 	    flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL);
 	WITNESS_LOCK(&lk->lock_object, 0, file, line);
 	if ((flags & LK_INTERLOCK) != 0)
 		VI_UNLOCK(vp);
 	LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line);
 	return (0);
 }
 
 static int
 crossmp_vop_unlock(struct vop_unlock_args *ap)
 {
 	struct vnode *vp;
 	struct lock *lk __unused;
 	int flags;
 
 	vp = ap->a_vp;
 	lk = vp->v_vnlock;
 	flags = ap->a_flags;
 
 	if ((flags & LK_INTERLOCK) != 0)
 		VI_UNLOCK(vp);
 	WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE);
 	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
 	    LOCK_LINE);
 	return (0);
 }
 
 static struct vop_vector crossmp_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_islocked =		crossmp_vop_islocked,
 	.vop_lock1 =		crossmp_vop_lock1,
 	.vop_unlock =		crossmp_vop_unlock,
 };
+/*
+ * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
+ * gets allocated early. See nameiinit for the direct call below.
+ */
 
 struct nameicap_tracker {
 	struct vnode *dp;
 	TAILQ_ENTRY(nameicap_tracker) nm_link;
 };
 
 /* Zone for cap mode tracker elements used for dotdot capability checks. */
 static uma_zone_t nt_zone;
 
 static void
 nameiinit(void *dummy __unused)
 {
 
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	vfs_vector_op_register(&crossmp_vnodeops);
 	getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
 
 static int lookup_cap_dotdot = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
     &lookup_cap_dotdot, 0,
     "enables \"..\" components in path lookup in capability mode");
 static int lookup_cap_dotdot_nonlocal = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
     &lookup_cap_dotdot_nonlocal, 0,
     "enables \"..\" components in path lookup in capability mode "
     "on non-local mount");
 
 static void
 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
 {
 	struct nameicap_tracker *nt;
 
 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
 		return;
 	if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_BENEATH_LATCHED)) ==
 	    NI_LCF_BENEATH_ABS) {
 		MPASS((ndp->ni_lcf & NI_LCF_LATCH) != 0);
 		if (dp != ndp->ni_beneath_latch)
 			return;
 		ndp->ni_lcf |= NI_LCF_BENEATH_LATCHED;
 	}
 	nt = uma_zalloc(nt_zone, M_WAITOK);
 	vhold(dp);
 	nt->dp = dp;
 	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
 }
 
 static void
 nameicap_cleanup(struct nameidata *ndp, bool clean_latch)
 {
 	struct nameicap_tracker *nt, *nt1;
 
 	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
 	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
 	TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
 		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
 		vdrop(nt->dp);
 		uma_zfree(nt_zone, nt);
 	}
 	if (clean_latch && (ndp->ni_lcf & NI_LCF_LATCH) != 0) {
 		ndp->ni_lcf &= ~NI_LCF_LATCH;
 		vrele(ndp->ni_beneath_latch);
 	}
 }
 
 /*
  * For dotdot lookups in capability mode, only allow the component
  * lookup to succeed if the resulting directory was already traversed
  * during the operation.  Also fail dotdot lookups for non-local
  * filesystems, where external agents might assist local lookups to
  * escape the compartment.
  */
 static int
 nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
 {
 	struct nameicap_tracker *nt;
 	struct mount *mp;
 
 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL ||
 	    dp->v_type != VDIR)
 		return (0);
 	mp = dp->v_mount;
 	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
 	    (mp->mnt_flag & MNT_LOCAL) == 0)
 		return (ENOTCAPABLE);
 	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
 	    nm_link) {
 		if (dp == nt->dp)
 			return (0);
 	}
 	if ((ndp->ni_lcf & NI_LCF_BENEATH_ABS) != 0) {
 		ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED;
 		nameicap_cleanup(ndp, false);
 		return (0);
 	}
 	return (ENOTCAPABLE);
 }
 
 static void
 namei_cleanup_cnp(struct componentname *cnp)
 {
 
 	uma_zfree(namei_zone, cnp->cn_pnbuf);
 #ifdef DIAGNOSTIC
 	cnp->cn_pnbuf = NULL;
 	cnp->cn_nameptr = NULL;
 #endif
 }
 
 static int
 namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
 {
 	struct componentname *cnp;
 
 	cnp = &ndp->ni_cnd;
 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_CAPFAIL))
 			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 		return (ENOTCAPABLE);
 	}
 	if ((cnp->cn_flags & BENEATH) != 0) {
 		ndp->ni_lcf |= NI_LCF_BENEATH_ABS;
 		ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED;
 		nameicap_cleanup(ndp, false);
 	}
 	while (*(cnp->cn_nameptr) == '/') {
 		cnp->cn_nameptr++;
 		ndp->ni_pathlen--;
 	}
 	*dpp = ndp->ni_rootdir;
 	vrefact(*dpp);
 	return (0);
 }
 
 /*
  * Convert a pathname into a pointer to a locked vnode.
  *
  * The FOLLOW flag is set when symbolic links are to be followed
  * when they occur at the end of the name translation process.
  * Symbolic links are always followed for all other pathname
  * components other than the last.
  *
  * The segflg defines whether the name is to be copied from user
  * space or kernel space.
  *
  * Overall outline of namei:
  *
  *	copy in name
  *	get starting directory
  *	while (!done && !error) {
  *		call lookup to search path.
  *		if symbolic link, massage name in buffer and continue
  *	}
  */
 int
 namei(struct nameidata *ndp)
 {
 	struct filedesc *fdp;	/* pointer to file descriptor state */
 	char *cp;		/* pointer into pathname argument */
 	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
 	struct componentname *cnp;
 	struct thread *td;
 	struct proc *p;
 	cap_rights_t rights;
 	struct filecaps dirfd_caps;
 	struct uio auio;
 	int error, linklen, startdir_used;
 
 	cnp = &ndp->ni_cnd;
 	td = cnp->cn_thread;
 	p = td->td_proc;
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
 	    ("namei: nameiop contaminated with flags"));
 	KASSERT((cnp->cn_flags & OPMASK) == 0,
 	    ("namei: flags contaminated with nameiops"));
 	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
 	    ndp->ni_startdir->v_type == VBAD);
 	fdp = p->p_fd;
 	TAILQ_INIT(&ndp->ni_cap_tracker);
 	ndp->ni_lcf = 0;
 
 	/* We will set this ourselves if we need it. */
 	cnp->cn_flags &= ~TRAILINGSLASH;
 
 	/*
 	 * Get a buffer for the name to be translated, and copy the
 	 * name into the buffer.
 	 */
 	if ((cnp->cn_flags & HASBUF) == 0)
 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	if (ndp->ni_segflg == UIO_SYSSPACE)
 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
 		    &ndp->ni_pathlen);
 	else
 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
 		    &ndp->ni_pathlen);
 
 	/*
 	 * Don't allow empty pathnames.
 	 */
 	if (error == 0 && *cnp->cn_pnbuf == '\0')
 		error = ENOENT;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * In capability mode, lookups must be restricted to happen in
 	 * the subtree with the root specified by the file descriptor:
 	 * - The root must be real file descriptor, not the pseudo-descriptor
 	 *   AT_FDCWD.
 	 * - The passed path must be relative and not absolute.
 	 * - If lookup_cap_dotdot is disabled, path must not contain the
 	 *   '..' components.
 	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
 	 *   components lookups result in the directories which were
 	 *   previously walked by us, which prevents an escape from
 	 *   the relative root.
 	 */
 	if (error == 0 && IN_CAPABILITY_MODE(td) &&
 	    (cnp->cn_flags & NOCAPCHECK) == 0) {
 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 		if (ndp->ni_dirfd == AT_FDCWD) {
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			error = ECAPMODE;
 		}
 	}
 #endif
 	if (error != 0) {
 		namei_cleanup_cnp(cnp);
 		ndp->ni_vp = NULL;
 		return (error);
 	}
 	ndp->ni_loopcnt = 0;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_NAMEI)) {
 		KASSERT(cnp->cn_thread == curthread,
 		    ("namei not using curthread"));
 		ktrnamei(cnp->cn_pnbuf);
 	}
 #endif
 	/*
 	 * Get starting point for the translation.
 	 */
 	FILEDESC_SLOCK(fdp);
 	ndp->ni_rootdir = fdp->fd_rdir;
 	vrefact(ndp->ni_rootdir);
 	ndp->ni_topdir = fdp->fd_jdir;
 
 	/*
 	 * If we are auditing the kernel pathname, save the user pathname.
 	 */
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf);
 	if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf);
 
 	startdir_used = 0;
 	dp = NULL;
 	cnp->cn_nameptr = cnp->cn_pnbuf;
 	if (cnp->cn_pnbuf[0] == '/') {
 		ndp->ni_resflags |= NIRES_ABS;
 		error = namei_handle_root(ndp, &dp);
 	} else {
 		if (ndp->ni_startdir != NULL) {
 			dp = ndp->ni_startdir;
 			startdir_used = 1;
 		} else if (ndp->ni_dirfd == AT_FDCWD) {
 			dp = fdp->fd_cdir;
 			vrefact(dp);
 		} else {
 			rights = ndp->ni_rightsneeded;
 			cap_rights_set(&rights, CAP_LOOKUP);
 
 			if (cnp->cn_flags & AUDITVNODE1)
 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
 			if (cnp->cn_flags & AUDITVNODE2)
 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
 			error = fgetvp_rights(td, ndp->ni_dirfd,
 			    &rights, &ndp->ni_filecaps, &dp);
 			if (error == EINVAL)
 				error = ENOTDIR;
 #ifdef CAPABILITIES
 			/*
 			 * If file descriptor doesn't have all rights,
 			 * all lookups relative to it must also be
 			 * strictly relative.
 			 */
 			CAP_ALL(&rights);
 			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
 			    &rights) ||
 			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 			    ndp->ni_filecaps.fc_nioctls != -1) {
 				ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 			}
 #endif
 		}
 		if (error == 0 && dp->v_type != VDIR)
 			error = ENOTDIR;
 	}
 	if (error == 0 && (cnp->cn_flags & BENEATH) != 0) {
 		if (ndp->ni_dirfd == AT_FDCWD) {
 			ndp->ni_beneath_latch = fdp->fd_cdir;
 			vrefact(ndp->ni_beneath_latch);
 		} else {
 			rights = ndp->ni_rightsneeded;
 			cap_rights_set(&rights, CAP_LOOKUP);
 			error = fgetvp_rights(td, ndp->ni_dirfd, &rights,
 			    &dirfd_caps, &ndp->ni_beneath_latch);
 			if (error == 0 && dp->v_type != VDIR) {
 				vrele(ndp->ni_beneath_latch);
 				error = ENOTDIR;
 			}
 		}
 		if (error == 0)
 			ndp->ni_lcf |= NI_LCF_LATCH;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	if (ndp->ni_startdir != NULL && !startdir_used)
 		vrele(ndp->ni_startdir);
 	if (error != 0) {
 		if (dp != NULL)
 			vrele(dp);
 		goto out;
 	}
 	MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) !=
 	    NI_LCF_BENEATH_ABS);
 	if (((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
 	    lookup_cap_dotdot != 0) ||
 	    ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
 	    (cnp->cn_flags & BENEATH) != 0))
 		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
 	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
 	    cnp->cn_flags);
 	for (;;) {
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
 		if (error != 0)
 			goto out;
 		/*
 		 * If not a symbolic link, we're done.
 		 */
 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
 			vrele(ndp->ni_rootdir);
 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) {
 				namei_cleanup_cnp(cnp);
 			} else
 				cnp->cn_flags |= HASBUF;
 			if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS |
 			    NI_LCF_BENEATH_LATCHED)) == NI_LCF_BENEATH_ABS) {
 				NDFREE(ndp, 0);
 				error = ENOTCAPABLE;
 			}
 			nameicap_cleanup(ndp, true);
 			SDT_PROBE2(vfs, namei, lookup, return, error,
 			    (error == 0 ? ndp->ni_vp : NULL));
 			return (error);
 		}
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
 			error = ELOOP;
 			break;
 		}
 #ifdef MAC
 		if ((cnp->cn_flags & NOMACCHECK) == 0) {
 			error = mac_vnode_check_readlink(td->td_ucred,
 			    ndp->ni_vp);
 			if (error != 0)
 				break;
 		}
 #endif
 		if (ndp->ni_pathlen > 1)
 			cp = uma_zalloc(namei_zone, M_WAITOK);
 		else
 			cp = cnp->cn_pnbuf;
 		aiov.iov_base = cp;
 		aiov.iov_len = MAXPATHLEN;
 		auio.uio_iov = &aiov;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = 0;
 		auio.uio_rw = UIO_READ;
 		auio.uio_segflg = UIO_SYSSPACE;
 		auio.uio_td = td;
 		auio.uio_resid = MAXPATHLEN;
 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
 		if (error != 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			break;
 		}
 		linklen = MAXPATHLEN - auio.uio_resid;
 		if (linklen == 0) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENOENT;
 			break;
 		}
 		if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
 			if (ndp->ni_pathlen > 1)
 				uma_zfree(namei_zone, cp);
 			error = ENAMETOOLONG;
 			break;
 		}
 		if (ndp->ni_pathlen > 1) {
 			bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
 			uma_zfree(namei_zone, cnp->cn_pnbuf);
 			cnp->cn_pnbuf = cp;
 		} else
 			cnp->cn_pnbuf[linklen] = '\0';
 		ndp->ni_pathlen += linklen;
 		vput(ndp->ni_vp);
 		dp = ndp->ni_dvp;
 		/*
 		 * Check if root directory should replace current directory.
 		 */
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		if (*(cnp->cn_nameptr) == '/') {
 			vrele(dp);
 			error = namei_handle_root(ndp, &dp);
 			if (error != 0)
 				goto out;
 		}
 	}
 	vput(ndp->ni_vp);
 	ndp->ni_vp = NULL;
 	vrele(ndp->ni_dvp);
 out:
 	vrele(ndp->ni_rootdir);
 	MPASS(error != 0);
 	namei_cleanup_cnp(cnp);
 	nameicap_cleanup(ndp, true);
 	SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
 	return (error);
 }
 
 static int
 compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags)
 {
 
 	if (mp == NULL || ((lkflags & LK_SHARED) &&
 	    (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) ||
 	    ((cnflags & ISDOTDOT) &&
 	    (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) {
 		lkflags &= ~LK_SHARED;
 		lkflags |= LK_EXCLUSIVE;
 	}
 	lkflags |= LK_NODDLKTREAT;
 	return (lkflags);
 }
 
 static __inline int
 needs_exclusive_leaf(struct mount *mp, int flags)
 {
 
 	/*
 	 * Intermediate nodes can use shared locks, we only need to
 	 * force an exclusive lock for leaf nodes.
 	 */
 	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
 		return (0);
 
 	/* Always use exclusive locks if LOCKSHARED isn't set. */
 	if (!(flags & LOCKSHARED))
 		return (1);
 
 	/*
 	 * For lookups during open(), if the mount point supports
 	 * extended shared operations, then use a shared lock for the
 	 * leaf node, otherwise use an exclusive lock.
 	 */
 	if ((flags & ISOPEN) != 0)
 		return (!MNT_EXTENDED_SHARED(mp));
 
 	/*
 	 * Lookup requests outside of open() that specify LOCKSHARED
 	 * only need a shared lock on the leaf vnode.
 	 */
 	return (0);
 }
 
 /*
  * Search a pathname.
  * This is a very central and rather complicated routine.
  *
  * The pathname is pointed to by ni_ptr and is of length ni_pathlen.
  * The starting directory is taken from ni_startdir. The pathname is
  * descended until done, or a symbolic link is encountered. The variable
  * ni_more is clear if the path is completed; it is set to one if a
  * symbolic link needing interpretation is encountered.
  *
  * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
  * whether the name is to be looked up, created, renamed, or deleted.
  * When CREATE, RENAME, or DELETE is specified, information usable in
  * creating, renaming, or deleting a directory entry may be calculated.
  * If flag has LOCKPARENT or'ed into it, the parent directory is returned
  * locked. If flag has WANTPARENT or'ed into it, the parent directory is
  * returned unlocked. Otherwise the parent directory is not returned. If
  * the target of the pathname exists and LOCKLEAF is or'ed into the flag
  * the target is returned locked, otherwise it is returned unlocked.
  * When creating or renaming and LOCKPARENT is specified, the target may not
  * be ".".  When deleting and LOCKPARENT is specified, the target may be ".".
  *
  * Overall outline of lookup:
  *
  * dirloop:
  *	identify next component of name at ndp->ni_ptr
  *	handle degenerate case where name is null string
  *	if .. and crossing mount points and on mounted filesys, find parent
  *	call VOP_LOOKUP routine for next component name
  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
  *	    component vnode returned in ni_vp (if it exists), locked.
  *	if result vnode is mounted on and crossing mount points,
  *	    find mounted on vnode
  *	if more components of name, do next level at dirloop
  *	return the answer in ni_vp, locked if LOCKLEAF set
  *	    if LOCKPARENT set, return locked parent in ni_dvp
  *	    if WANTPARENT set, return unlocked parent in ni_dvp
  */
 int
 lookup(struct nameidata *ndp)
 {
 	char *cp;			/* pointer into pathname argument */
 	char *prev_ni_next;		/* saved ndp->ni_next */
 	struct vnode *dp = NULL;	/* the directory we are searching */
 	struct vnode *tdp;		/* saved dp */
 	struct mount *mp;		/* mount table entry */
 	struct prison *pr;
 	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
 	int docache;			/* == 0 do not cache last component */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 	int dpunlocked = 0;		/* dp has already been unlocked */
 	int relookup = 0;		/* do not consume the path component */
 	struct componentname *cnp = &ndp->ni_cnd;
 	int lkflags_save;
 	int ni_dvp_unlocked;
 	
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	ni_dvp_unlocked = 0;
 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
 	if (cnp->cn_nameiop == DELETE ||
 	    (wantparent && cnp->cn_nameiop != CREATE &&
 	     cnp->cn_nameiop != LOOKUP))
 		docache = 0;
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	ndp->ni_dvp = NULL;
 	/*
 	 * We use shared locks until we hit the parent of the last cn then
 	 * we adjust based on the requesting flags.
 	 */
 	cnp->cn_lkflags = LK_SHARED;
 	dp = ndp->ni_startdir;
 	ndp->ni_startdir = NULLVP;
 	vn_lock(dp,
 	    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY,
 	    cnp->cn_flags));
 
 dirloop:
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 	for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
 		continue;
 	cnp->cn_namelen = cp - cnp->cn_nameptr;
 	if (cnp->cn_namelen > NAME_MAX) {
 		error = ENAMETOOLONG;
 		goto bad;
 	}
 #ifdef NAMEI_DIAGNOSTIC
 	{ char c = *cp;
 	*cp = '\0';
 	printf("{%s}: ", cnp->cn_nameptr);
 	*cp = c; }
 #endif
 	prev_ni_pathlen = ndp->ni_pathlen;
 	ndp->ni_pathlen -= cnp->cn_namelen;
 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
 	prev_ni_next = ndp->ni_next;
 	ndp->ni_next = cp;
 
 	/*
 	 * Replace multiple slashes by a single slash and trailing slashes
 	 * by a null.  This must be done before VOP_LOOKUP() because some
 	 * fs's don't know about trailing slashes.  Remember if there were
 	 * trailing slashes to handle symlinks, existing non-directories
 	 * and non-existing files that won't be directories specially later.
 	 */
 	while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
 		cp++;
 		ndp->ni_pathlen--;
 		if (*cp == '\0') {
 			*ndp->ni_next = '\0';
 			cnp->cn_flags |= TRAILINGSLASH;
 		}
 	}
 	ndp->ni_next = cp;
 
 	cnp->cn_flags |= MAKEENTRY;
 	if (*cp == '\0' && docache == 0)
 		cnp->cn_flags &= ~MAKEENTRY;
 	if (cnp->cn_namelen == 2 &&
 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
 		cnp->cn_flags |= ISDOTDOT;
 	else
 		cnp->cn_flags &= ~ISDOTDOT;
 	if (*ndp->ni_next == 0)
 		cnp->cn_flags |= ISLASTCN;
 	else
 		cnp->cn_flags &= ~ISLASTCN;
 
 	if ((cnp->cn_flags & ISLASTCN) != 0 &&
 	    cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EINVAL;
 		goto bad;
 	}
 
 	nameicap_tracker_add(ndp, dp);
 
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
 	 * e.g. like "/." or ".".
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		if (dp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto bad;
 		}
 		if (cnp->cn_nameiop != LOOKUP) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (wantparent) {
 			ndp->ni_dvp = dp;
 			VREF(dp);
 		}
 		ndp->ni_vp = dp;
 
 		if (cnp->cn_flags & AUDITVNODE1)
 			AUDIT_ARG_VNODE1(dp);
 		else if (cnp->cn_flags & AUDITVNODE2)
 			AUDIT_ARG_VNODE2(dp);
 
 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
 			VOP_UNLOCK(dp, 0);
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		goto success;
 	}
 
 	/*
 	 * Handle "..": five special cases.
 	 * 0. If doing a capability lookup and lookup_cap_dotdot is
 	 *    disabled, return ENOTCAPABLE.
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
 	 *    or at absolute root directory
 	 *    then ignore it so can't get out.
 	 * 3. If this vnode is the root of a mounted
 	 *    filesystem, then replace it with the
 	 *    vnode which was mounted on so we take the
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
 	 * 5. If doing a capability lookup and lookup_cap_dotdot is
 	 *    enabled, return ENOTCAPABLE if the lookup would escape
 	 *    from the initial file descriptor directory.  Checks are
 	 *    done by ensuring that namei() already traversed the
 	 *    result of dotdot lookup.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
 		    == NI_LCF_STRICTRELATIVE) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			error = ENOTCAPABLE;
 			goto bad;
 		}
 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 			error = EINVAL;
 			goto bad;
 		}
 		for (;;) {
 			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
 			     pr = pr->pr_parent)
 				if (dp == pr->pr_root)
 					break;
 			if (dp == ndp->ni_rootdir || 
 			    dp == ndp->ni_topdir || 
 			    dp == rootvnode ||
 			    pr != NULL ||
 			    ((dp->v_vflag & VV_ROOT) != 0 &&
 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
 				VREF(dp);
 				goto nextname;
 			}
 			if ((dp->v_vflag & VV_ROOT) == 0)
 				break;
 			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
 				error = ENOENT;
 				goto bad;
 			}
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
 			error = nameicap_check_dotdot(ndp, dp);
 			if (error != 0) {
 #ifdef KTRACE
 				if (KTRPOINT(curthread, KTR_CAPFAIL))
 					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 				goto bad;
 			}
 		}
 	}
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 unionlookup:
 #ifdef MAC
 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
 		error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp,
 		    cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 	ndp->ni_dvp = dp;
 	ndp->ni_vp = NULL;
 	ASSERT_VOP_LOCKED(dp, "lookup");
 	/*
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
 	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
 		vn_lock(dp, LK_UPGRADE|LK_RETRY);
 	if (VN_IS_DOOMED(dp)) {
 		error = ENOENT;
 		goto bad;
 	}
 	/*
 	 * If we're looking up the last component and we need an exclusive
 	 * lock, adjust our lkflags.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 #ifdef NAMEI_DIAGNOSTIC
 	vn_printf(dp, "lookup in ");
 #endif
 	lkflags_save = cnp->cn_lkflags;
 	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags,
 	    cnp->cn_flags);
 	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
 	cnp->cn_lkflags = lkflags_save;
 	if (error != 0) {
 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
 #ifdef NAMEI_DIAGNOSTIC
 		printf("not found\n");
 #endif
 		if ((error == ENOENT) &&
 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
 			VREF(dp);
 			vput(tdp);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
 			nameicap_tracker_add(ndp, dp);
 			goto unionlookup;
 		}
 
 		if (error == ERELOOKUP) {
 			vref(dp);
 			ndp->ni_vp = dp;
 			error = 0;
 			relookup = 1;
 			goto good;
 		}
 
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * At this point, we know we're at the end of the
 		 * pathname.  If creating / renaming, we can consider
 		 * allowing the file or directory to be created / renamed,
 		 * provided we're not on a read-only filesystem.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* trailing slash only allowed for directories */
 		if ((cnp->cn_flags & TRAILINGSLASH) &&
 		    !(cnp->cn_flags & WILLBEDIR)) {
 			error = ENOENT;
 			goto bad;
 		}
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0);
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir = ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
 		goto success;
 	}
 
 good:
 #ifdef NAMEI_DIAGNOSTIC
 	printf("found\n");
 #endif
 	dp = ndp->ni_vp;
 
 	/*
 	 * Check to see if the vnode has been mounted on;
 	 * if so find the root of the mounted filesystem.
 	 */
 	while (dp->v_type == VDIR && (mp = dp->v_mountedhere) &&
 	       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
 		if (vfs_busy(mp, 0))
 			continue;
 		vput(dp);
 		if (dp != ndp->ni_dvp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		vrefact(vp_crossmp);
 		ndp->ni_dvp = vp_crossmp;
 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
 		    cnp->cn_flags), &tdp);
 		vfs_unbusy(mp);
 		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
 			panic("vp_crossmp exclusively locked or reclaimed");
 		if (error) {
 			dpunlocked = 1;
 			goto bad2;
 		}
 		ndp->ni_vp = dp = tdp;
 	}
 
 	/*
 	 * Check for symbolic link
 	 */
 	if ((dp->v_type == VLNK) &&
 	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
 	     *ndp->ni_next == '/')) {
 		cnp->cn_flags |= ISSYMLINK;
 		if (VN_IS_DOOMED(dp)) {
 			/*
 			 * We can't know whether the directory was mounted with
 			 * NOSYMFOLLOW, so we can't follow safely.
 			 */
 			error = ENOENT;
 			goto bad2;
 		}
 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
 			error = EACCES;
 			goto bad2;
 		}
 		/*
 		 * Symlink code always expects an unlocked dvp.
 		 */
 		if (ndp->ni_dvp != ndp->ni_vp) {
 			VOP_UNLOCK(ndp->ni_dvp, 0);
 			ni_dvp_unlocked = 1;
 		}
 		goto success;
 	}
 
 nextname:
 	/*
 	 * Not a symbolic link that we will follow.  Continue with the
 	 * next component if there is any; otherwise, we're done.
 	 */
 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
 	    ("lookup: invalid path state."));
 	if (relookup) {
 		relookup = 0;
 		ndp->ni_pathlen = prev_ni_pathlen;
 		ndp->ni_next = prev_ni_next;
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
 	if (cnp->cn_flags & ISDOTDOT) {
 		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
 		if (error != 0) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
 #endif
 			goto bad2;
 		}
 	}
 	if (*ndp->ni_next == '/') {
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
 			cnp->cn_nameptr++;
 			ndp->ni_pathlen--;
 		}
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
 	/*
 	 * If we're processing a path with a trailing slash,
 	 * check that the end result is a directory.
 	 */
 	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto bad2;
 	}
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		error = EROFS;
 		goto bad2;
 	}
 	if (cnp->cn_flags & SAVESTART) {
 		ndp->ni_startdir = ndp->ni_dvp;
 		VREF(ndp->ni_startdir);
 	}
 	if (!wantparent) {
 		ni_dvp_unlocked = 2;
 		if (ndp->ni_dvp != dp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
 		VOP_UNLOCK(ndp->ni_dvp, 0);
 		ni_dvp_unlocked = 1;
 	}
 
 	if (cnp->cn_flags & AUDITVNODE1)
 		AUDIT_ARG_VNODE1(dp);
 	else if (cnp->cn_flags & AUDITVNODE2)
 		AUDIT_ARG_VNODE2(dp);
 
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0);
 success:
 	/*
 	 * Because of shared lookup we may have the vnode shared locked, but
 	 * the caller may want it to be exclusively locked.
 	 */
 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
 	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
 		vn_lock(dp, LK_UPGRADE | LK_RETRY);
 		if (VN_IS_DOOMED(dp)) {
 			error = ENOENT;
 			goto bad2;
 		}
 	}
 	return (0);
 
 bad2:
 	if (ni_dvp_unlocked != 2) {
 		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
 	}
 bad:
 	if (!dpunlocked)
 		vput(dp);
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 /*
  * relookup - lookup a path name component
  *    Used by lookup to re-acquire things.
  */
 int
 relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct vnode *dp = NULL;		/* the directory we are searching */
 	int wantparent;			/* 1 => wantparent or lockparent flag */
 	int rdonly;			/* lookup read-only flag bit */
 	int error = 0;
 
 	KASSERT(cnp->cn_flags & ISLASTCN,
 	    ("relookup: Not given last component."));
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
 	wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT);
 	KASSERT(wantparent, ("relookup: parent not wanted."));
 	rdonly = cnp->cn_flags & RDONLY;
 	cnp->cn_flags &= ~ISSYMLINK;
 	dp = dvp;
 	cnp->cn_lkflags = LK_EXCLUSIVE;
 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Search a new directory.
 	 *
 	 * The last component of the filename is left accessible via
 	 * cnp->cn_nameptr for callers that need the name. Callers needing
 	 * the name set the SAVENAME flag. When done, they assume
 	 * responsibility for freeing the pathname buffer.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	printf("{%s}: ", cnp->cn_nameptr);
 #endif
 
 	/*
 	 * Check for "" which represents the root directory after slash
 	 * removal.
 	 */
 	if (cnp->cn_nameptr[0] == '\0') {
 		/*
 		 * Support only LOOKUP for "/" because lookup()
 		 * can't succeed for CREATE, DELETE and RENAME.
 		 */
 		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
 		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
 
 		if (!(cnp->cn_flags & LOCKLEAF))
 			VOP_UNLOCK(dp, 0);
 		*vpp = dp;
 		/* XXX This should probably move to the top of function. */
 		if (cnp->cn_flags & SAVESTART)
 			panic("lookup: SAVESTART");
 		return (0);
 	}
 
 	if (cnp->cn_flags & ISDOTDOT)
 		panic ("relookup: lookup on dot-dot");
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
 	 */
 #ifdef NAMEI_DIAGNOSTIC
 	vn_printf(dp, "search in ");
 #endif
 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
 		KASSERT(*vpp == NULL, ("leaf should be empty"));
 		if (error != EJUSTRETURN)
 			goto bad;
 		/*
 		 * If creating and at end of pathname, then can consider
 		 * allowing file to be created.
 		 */
 		if (rdonly) {
 			error = EROFS;
 			goto bad;
 		}
 		/* ASSERT(dvp == ndp->ni_startdir) */
 		if (cnp->cn_flags & SAVESTART)
 			VREF(dvp);
 		if ((cnp->cn_flags & LOCKPARENT) == 0)
 			VOP_UNLOCK(dp, 0);
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
 		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		return (0);
 	}
 
 	dp = *vpp;
 
 	/*
 	 * Disallow directory write attempts on read-only filesystems.
 	 */
 	if (rdonly &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
 		if (dvp == dp)
 			vrele(dvp);
 		else
 			vput(dvp);
 		error = EROFS;
 		goto bad;
 	}
 	/*
 	 * Set the parent lock/ref state to the requested state.
 	 */
 	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) {
 		if (wantparent)
 			VOP_UNLOCK(dvp, 0);
 		else
 			vput(dvp);
 	} else if (!wantparent)
 		vrele(dvp);
 	/*
 	 * Check for symbolic link
 	 */
 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
 	    ("relookup: symlink found.\n"));
 
 	/* ASSERT(dvp == ndp->ni_startdir) */
 	if (cnp->cn_flags & SAVESTART)
 		VREF(dvp);
 	
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0);
 	return (0);
 bad:
 	vput(dp);
 	*vpp = NULL;
 	return (error);
 }
 
 void
 NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
     const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
     struct thread *td)
 {
 
 	ndp->ni_cnd.cn_nameiop = op;
 	ndp->ni_cnd.cn_flags = flags;
 	ndp->ni_segflg = segflg;
 	ndp->ni_dirp = namep;
 	ndp->ni_dirfd = dirfd;
 	ndp->ni_startdir = startdir;
 	ndp->ni_resflags = 0;
 	filecaps_init(&ndp->ni_filecaps);
 	ndp->ni_cnd.cn_thread = td;
 	if (rightsp != NULL)
 		ndp->ni_rightsneeded = *rightsp;
 	else
 		cap_rights_init(&ndp->ni_rightsneeded);
 }
 
 /*
  * Free data allocated by namei(); see namei(9) for details.
  */
 void
 NDFREE(struct nameidata *ndp, const u_int flags)
 {
 	int unlock_dvp;
 	int unlock_vp;
 
 	unlock_dvp = 0;
 	unlock_vp = 0;
 
 	if (!(flags & NDF_NO_FREE_PNBUF) &&
 	    (ndp->ni_cnd.cn_flags & HASBUF)) {
 		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
 		ndp->ni_cnd.cn_flags &= ~HASBUF;
 	}
 	if (!(flags & NDF_NO_VP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
 		unlock_vp = 1;
 	if (!(flags & NDF_NO_DVP_UNLOCK) &&
 	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
 	    ndp->ni_dvp != ndp->ni_vp)
 		unlock_dvp = 1;
 	if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) {
 		if (unlock_vp) {
 			vput(ndp->ni_vp);
 			unlock_vp = 0;
 		} else
 			vrele(ndp->ni_vp);
 		ndp->ni_vp = NULL;
 	}
 	if (unlock_vp)
 		VOP_UNLOCK(ndp->ni_vp, 0);
 	if (!(flags & NDF_NO_DVP_RELE) &&
 	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
 		if (unlock_dvp) {
 			vput(ndp->ni_dvp);
 			unlock_dvp = 0;
 		} else
 			vrele(ndp->ni_dvp);
 		ndp->ni_dvp = NULL;
 	}
 	if (unlock_dvp)
 		VOP_UNLOCK(ndp->ni_dvp, 0);
 	if (!(flags & NDF_NO_STARTDIR_RELE) &&
 	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
 		vrele(ndp->ni_startdir);
 		ndp->ni_startdir = NULL;
 	}
 }
 
 /*
  * Determine if there is a suitable alternate filename under the specified
  * prefix for the specified path.  If the create flag is set, then the
  * alternate prefix will be used so long as the parent directory exists.
  * This is used by the various compatibility ABIs so that Linux binaries prefer
  * files under /compat/linux for example.  The chosen path (whether under
  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
  * to by pathbuf.  The caller is responsible for free'ing the buffer from
  * the M_TEMP bucket if one is returned.
  */
 int
 kern_alternate_path(struct thread *td, const char *prefix, const char *path,
     enum uio_seg pathseg, char **pathbuf, int create, int dirfd)
 {
 	struct nameidata nd, ndroot;
 	char *ptr, *buf, *cp;
 	size_t len, sz;
 	int error;
 
 	buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	*pathbuf = buf;
 
 	/* Copy the prefix into the new pathname as a starting point. */
 	len = strlcpy(buf, prefix, MAXPATHLEN);
 	if (len >= MAXPATHLEN) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (EINVAL);
 	}
 	sz = MAXPATHLEN - len;
 	ptr = buf + len;
 
 	/* Append the filename to the prefix. */
 	if (pathseg == UIO_SYSSPACE)
 		error = copystr(path, ptr, sz, &len);
 	else
 		error = copyinstr(path, ptr, sz, &len);
 
 	if (error) {
 		*pathbuf = NULL;
 		free(buf, M_TEMP);
 		return (error);
 	}
 
 	/* Only use a prefix with absolute pathnames. */
 	if (*ptr != '/') {
 		error = EINVAL;
 		goto keeporig;
 	}
 
 	if (dirfd != AT_FDCWD) {
 		/*
 		 * We want the original because the "prefix" is
 		 * included in the already opened dirfd.
 		 */
 		bcopy(ptr, buf, len);
 		return (0);
 	}
 
 	/*
 	 * We know that there is a / somewhere in this pathname.
 	 * Search backwards for it, to find the file's parent dir
 	 * to see if it exists in the alternate tree. If it does,
 	 * and we want to create a file (cflag is set). We don't
 	 * need to worry about the root comparison in this case.
 	 */
 
 	if (create) {
 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
 		*cp = '\0';
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
 		error = namei(&nd);
 		*cp = '/';
 		if (error != 0)
 			goto keeporig;
 	} else {
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td);
 
 		error = namei(&nd);
 		if (error != 0)
 			goto keeporig;
 
 		/*
 		 * We now compare the vnode of the prefix to the one
 		 * vnode asked. If they resolve to be the same, then we
 		 * ignore the match so that the real root gets used.
 		 * This avoids the problem of traversing "../.." to find the
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
 		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
 		    td);
 
 		/* We shouldn't ever get an error from this namei(). */
 		error = namei(&ndroot);
 		if (error == 0) {
 			if (nd.ni_vp == ndroot.ni_vp)
 				error = ENOENT;
 
 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
 			vrele(ndroot.ni_vp);
 		}
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
 
 keeporig:
 	/* If there was an error, use the original path name. */
 	if (error)
 		bcopy(ptr, buf, len);
 	return (error);
 }
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c	(revision 355789)
+++ head/sys/kern/vfs_subr.c	(revision 355790)
@@ -1,6085 +1,6090 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 
 #include <machine/stdarg.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	v_init_counters(struct vnode *);
 static void	v_incr_devcount(struct vnode *);
 static void	v_decr_devcount(struct vnode *);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static void	vfs_knl_assert_locked(void *arg);
 static void	vfs_knl_assert_unlocked(void *arg);
 static void	vnlru_return_batches(struct vfsops *mnt_op);
 static void	destroy_vpollinfo(struct vpollinfo *vi);
 static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
 		    daddr_t startlbn, daddr_t endlbn);
 
 /*
  * These fences are intended for cases where some synchronization is
  * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
  * and v_usecount) updates.  Access to v_iflags is generally synchronized
  * by the interlock, but we have some internal assertions that check vnode
  * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
  * for now.
  */
 #ifdef INVARIANTS
 #define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
 #define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
 #else
 #define	VNODE_REFCOUNT_FENCE_ACQ()
 #define	VNODE_REFCOUNT_FENCE_REL()
 #endif
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
     "Number of vnodes in existence");
 
 static counter_u64_t vnodes_created;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
     "Number of vnodes created by getnewvnode");
 
 static u_long mnt_free_list_batch = 128;
 SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
     &mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 };
 int vttoif_tab[10] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * "Free" vnode target.  Free vnodes are rarely completely free, but are
  * just ones that are cheap to recycle.  Usually they are for files which
  * have been stat'd but not read; these usually have inode and namecache
  * data attached to them.  This target is the preferred minimum size of a
  * sub-cache consisting mostly of such files. The system balances the size
  * of this sub-cache with its complement to try to prevent either from
  * thrashing while the other is relatively inactive.  The targets express
  * a preference for the best balance.
  *
  * "Above" this target there are 2 further targets (watermarks) related
  * to recyling of free vnodes.  In the best-operating case, the cache is
  * exactly full, the free list has size between vlowat and vhiwat above the
  * free target, and recycling from it and normal use maintains this state.
  * Sometimes the free list is below vlowat or even empty, but this state
  * is even better for immediate use provided the cache is not full.
  * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
  * ones) to reach one of these states.  The watermarks are currently hard-
  * coded as 4% and 9% of the available space higher.  These and the default
  * of 25% for wantfreevnodes are too large if the memory size is large.
  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  * whenever vnlru_proc() becomes active.
  */
 static u_long wantfreevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
 static u_long freevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
     &freevnodes, 0, "Number of \"free\" vnodes");
 
 static counter_u64_t recycles_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
     "Number of vnodes recycled to meet vnode cache targets");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
     &reassignbufcalls, 0, "Number of calls to reassignbuf");
 
 static counter_u64_t free_owe_inact;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
     "Number of times free vnodes kept on active list due to VFS "
     "owing inactivation");
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 static uma_zone_t buf_trie_zone;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, mounted on block devices
  * are delayed only about a half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 static struct cv sync_wakeup;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
     "Time to delay syncing files (in seconds)");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
     "Time to delay syncing directories (in seconds)");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
     "Time to delay syncing metadata (in seconds)");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /* Target for maximum number of vnodes. */
 int desiredvnodes;
 static int gapvnodes;		/* gap between wanted and desired */
 static int vhiwat;		/* enough extras after expansion */
 static int vlowat;		/* minimal extras before expansion */
 static int vstir;		/* nonzero to stir non-free vnodes */
 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
 {
 	int error, old_desiredvnodes;
 
 	old_desiredvnodes = desiredvnodes;
 	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
 		return (error);
 	if (old_desiredvnodes != desiredvnodes) {
 		wantfreevnodes = desiredvnodes / 4;
 		/* XXX locking seems to be incomplete. */
 		vfs_hash_changesize(desiredvnodes);
 		cache_changesize(desiredvnodes);
 	}
 	return (0);
 }
 
 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
     sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
 static int
 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	char *buf;
 	unsigned long ndflags;
 	int error;
 
 	if (req->newptr == NULL)
 		return (EINVAL);
 	if (req->newlen >= PATH_MAX)
 		return (E2BIG);
 
 	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
 	error = SYSCTL_IN(req, buf, req->newlen);
 	if (error != 0)
 		goto out;
 
 	buf[req->newlen] = '\0';
 
 	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
 	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vp = nd.ni_vp;
 
 	if (VN_IS_DOOMED(vp)) {
 		/*
 		 * This vnode is being recycled.  Return != 0 to let the caller
 		 * know that the sysctl had no effect.  Return EAGAIN because a
 		 * subsequent call will likely succeed (since namei will create
 		 * a new vnode if necessary)
 		 */
 		error = EAGAIN;
 		goto putvnode;
 	}
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 putvnode:
 	NDFREE(&nd, 0);
 out:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct file *fp;
 	int error;
 	int fd;
 
 	if (req->newptr == NULL)
 		return (EBADF);
 
         error = sysctl_handle_int(oidp, &fd, 0, req);
         if (error != 0)
                 return (error);
 	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 
 	error = vn_lock(vp, LK_EXCLUSIVE);
 	if (error != 0)
 		goto drop;
 
 	counter_u64_add(recycles_count, 1);
 	vgone(vp);
 	VOP_UNLOCK(vp, 0);
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
     CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
     CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
     sysctl_ftry_reclaim_vnode, "I",
     "Try to reclaim a vnode by its file descriptor");
 
 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
 static int vnsz2log;
 
 /*
  * Support for the bufobj clean & dirty pctrie.
  */
 static void *
 buf_trie_alloc(struct pctrie *ptree)
 {
 
 	return uma_zalloc(buf_trie_zone, M_NOWAIT);
 }
 
 static void
 buf_trie_free(struct pctrie *ptree, void *node)
 {
 
 	uma_zfree(buf_trie_zone, node);
 }
 PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
 
 /*
  * Initialize the vnode management data structures.
  *
  * Reevaluate the following cap on the number of vnodes after the physical
  * memory size exceeds 512GB.  In the limit, as the physical memory size
  * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
  */
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
 #endif
 
 /*
  * Initialize a vnode as it first enters the zone.
  */
 static int
 vnode_init(void *mem, int size, int flags)
 {
 	struct vnode *vp;
 
 	vp = mem;
 	bzero(vp, size);
 	/*
 	 * Setup locks.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/*
 	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
 	    LK_NOSHARE | LK_IS_VNODE);
 	/*
 	 * Initialize bufobj.
 	 */
 	bufobj_init(&vp->v_bufobj, vp);
 	/*
 	 * Initialize namecache.
 	 */
 	LIST_INIT(&vp->v_cache_src);
 	TAILQ_INIT(&vp->v_cache_dst);
 	/*
 	 * Initialize rangelocks.
 	 */
 	rangelock_init(&vp->v_rl);
 	return (0);
 }
 
 /*
  * Free a vnode when it is cleared from the zone.
  */
 static void
 vnode_fini(void *mem, int size)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	vp = mem;
 	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	bo = &vp->v_bufobj;
 	rw_destroy(BO_LOCKPTR(bo));
 }
 
 /*
  * Provide the size of NFS nclnode and NFS fh for calculation of the
  * vnode memory consumption.  The size is specified directly to
  * eliminate dependency on NFS-private header.
  *
  * Other filesystems may use bigger or smaller (like UFS and ZFS)
  * private inode data, but the NFS-based estimation is ample enough.
  * Still, we care about differences in the size between 64- and 32-bit
  * platforms.
  *
  * Namecache structure size is heuristically
  * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
  */
 #ifdef _LP64
 #define	NFS_NCLNODE_SZ	(528 + 64)
 #define	NC_SZ		148
 #else
 #define	NFS_NCLNODE_SZ	(360 + 32)
 #define	NC_SZ		92
 #endif
 
 static void
 vntblinit(void *dummy __unused)
 {
 	u_int i;
 	int physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
 	 * kernel's heap size.  Generally speaking, it scales with the
 	 * physical memory size.  The ratio of desiredvnodes to the physical
 	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
 	 * Thereafter, the
 	 * marginal ratio of desiredvnodes to the physical memory size is
 	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
 	 * size.  The memory required by desiredvnodes vnodes and vm objects
 	 * must not exceed 1/10th of the kernel's heap size.
 	 */
 	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
 	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
 	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
 	desiredvnodes = min(physvnodes, virtvnodes);
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %d -> %d\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
 	 * Preallocate enough nodes to support one-per buf so that
 	 * we can not fail an insert.  reassignbuf() callers can not
 	 * tolerate the insertion failure.
 	 */
 	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
 	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
 	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
 	uma_prealloc(buf_trie_zone, nbuf);
 
 	vnodes_created = counter_u64_alloc(M_WAITOK);
 	recycles_count = counter_u64_alloc(M_WAITOK);
 	free_owe_inact = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 	    &syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 	cv_init(&sync_wakeup, "syncer");
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting. Eventually, mountlist_mtx is not released on failure.
  *
  * vfs_busy() is a custom lock, it can block the caller.
  * vfs_busy() only sleeps if the unmount is active on the mount point.
  * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
  * vnode belonging to mp.
  *
  * Lookup uses vfs_busy() to traverse mount points.
  * root fs			var fs
  * / vnode lock		A	/ vnode lock (/var)		D
  * /var vnode lock	B	/log vnode lock(/var/log)	E
  * vfs_busy lock	C	vfs_busy lock			F
  *
  * Within each file system, the lock order is C->A->B and F->D->E.
  *
  * When traversing across mounts, the system follows that lock order:
  *
  *        C->A->B
  *              |
  *              +->F->D->E
  *
  * The lookup() process for namei("/var") illustrates the process:
  *  VOP_LOOKUP() obtains B while A is held
  *  vfs_busy() obtains a shared lock on F while A and B are held
  *  vput() releases lock on B
  *  vput() releases lock on A
  *  VFS_ROOT() obtains lock on D while shared lock on F is held
  *  vfs_unbusy() releases shared lock on F
  *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
  *    Attempt to lock A (instead of vp_crossmp) while D is held would
  *    violate the global order, causing deadlocks.
  *
  * dounmount() locks B while F is drained.
  */
 int
 vfs_busy(struct mount *mp, int flags)
 {
 
 	MPASS((flags & ~MBF_MASK) == 0);
 	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
 
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 		vfs_mp_count_add_pcpu(mp, ref, 1);
 		vfs_mp_count_add_pcpu(mp, lockref, 1);
 		vfs_op_thread_exit(mp);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REF(mp);
 	/*
 	 * If mount point is currently being unmounted, sleep until the
 	 * mount point fate is decided.  If thread doing the unmounting fails,
 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 	 * that this mount point has survived the unmount attempt and vfs_busy
 	 * should retry.  Otherwise the unmounter thread will set MNTK_REFEXPIRE
 	 * flag in addition to MNTK_UNMOUNT, indicating that mount point is
 	 * about to be really destroyed.  vfs_busy needs to release its
 	 * reference on the mount point in this case and return with ENOENT,
 	 * telling the caller that mount mount it tried to busy is no longer
 	 * valid.
 	 */
 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			CTR1(KTR_VFS, "%s: failed busying before sleeping",
 			    __func__);
 			return (ENOENT);
 		}
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_lock(&mountlist_mtx);
 		MNT_ILOCK(mp);
 	}
 	if (flags & MBF_MNTLSTLOCK)
 		mtx_unlock(&mountlist_mtx);
 	mp->mnt_lockref++;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 /*
  * Free a busy filesystem.
  */
 void
 vfs_unbusy(struct mount *mp)
 {
 	int c;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		vfs_mp_count_sub_pcpu(mp, lockref, 1);
 		vfs_mp_count_sub_pcpu(mp, ref, 1);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_lockref;
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
 		mp->mnt_kern_flag &= ~MNTK_DRAINING;
 		wakeup(&mp->mnt_lockref);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Lookup a mount point by filesystem identifier.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 			vfs_ref(mp);
 			mtx_unlock(&mountlist_mtx);
 			return (mp);
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	return ((struct mount *) 0);
 }
 
 /*
  * Lookup a mount point by filesystem identifier, busying it before
  * returning.
  *
  * To avoid congestion on mountlist_mtx, implement simple direct-mapped
  * cache for popular filesystem identifiers.  The cache is lockess, using
  * the fact that struct mount's are never freed.  In worst case we may
  * get pointer to unmounted or even different filesystem, so we have to
  * check what we got, and go slow way if so.
  */
 struct mount *
 vfs_busyfs(fsid_t *fsid)
 {
 #define	FSID_CACHE_SIZE	256
 	typedef struct mount * volatile vmp_t;
 	static vmp_t cache[FSID_CACHE_SIZE];
 	struct mount *mp;
 	int error;
 	uint32_t hash;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
 	hash = fsid->val[0] ^ fsid->val[1];
 	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
 	mp = cache[hash];
 	if (mp == NULL ||
 	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
 	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
 		goto slow;
 	if (vfs_busy(mp, 0) != 0) {
 		cache[hash] = NULL;
 		goto slow;
 	}
 	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
 		return (mp);
 	else
 	    vfs_unbusy(mp);
 
 slow:
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
 			if (error) {
 				cache[hash] = NULL;
 				mtx_unlock(&mountlist_mtx);
 				return (NULL);
 			}
 			cache[hash] = mp;
 			return (mp);
 		}
 	}
 	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
 	mtx_unlock(&mountlist_mtx);
 	return ((struct mount *) 0);
 }
 
 /*
  * Check if a user can access privileged mount options.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 	int error;
 
 	if (jailed(td->td_ucred)) {
 		/*
 		 * If the jail of the calling thread lacks permission for
 		 * this type of file system, deny immediately.
 		 */
 		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
 			return (EPERM);
 
 		/*
 		 * If the file system was mounted outside the jail of the
 		 * calling thread, deny immediately.
 		 */
 		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
 			return (EPERM);
 	}
 
 	/*
 	 * If file system supports delegated administration, we don't check
 	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
 	 * by the file system itself.
 	 * If this is not the user that did original mount, we check for
 	 * the PRIV_VFS_MOUNT_OWNER privilege.
 	 */
 	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
 		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Get a new unique fsid.  Try to make its val[0] unique, since this value
  * will be used to create fake device numbers for stat().  Also try (but
  * not so hard) make its val[0] unique mod 2^16, since some emulators only
  * support 16-bit device numbers.  We end up with unique val[0]'s for the
  * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
  *
  * Keep in mind that several mounts may be running in parallel.  Starting
  * the search one past where the previous search terminated is both a
  * micro-optimization and a defense against returning the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static uint16_t mntid_base;
 	struct mount *nmp;
 	fsid_t tfsid;
 	int mtype;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 	mtx_lock(&mntid_mtx);
 	mtype = mp->mnt_vfc->vfc_typenum;
 	tfsid.val[1] = mtype;
 	mtype = (mtype & 0xFF) << 24;
 	for (;;) {
 		tfsid.val[0] = makedev(255,
 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
 		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 			break;
 		vfs_rel(nmp);
 	}
 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_USEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
     "3+: sec + ns (max. precision))");
 
 /*
  * Get a current timestamp.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 
 	switch (timestamp_precision) {
 	case TSP_SEC:
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 		break;
 	case TSP_HZ:
 		getnanotime(tsp);
 		break;
 	case TSP_USEC:
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 		break;
 	case TSP_NSEC:
 	default:
 		nanotime(tsp);
 		break;
 	}
 }
 
 /*
  * Set vnode attributes to VNOVAL
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	vap->va_type = VNON;
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 	vap->va_flags = VNOVAL;
 	vap->va_gen = VNOVAL;
 	vap->va_vaflags = 0;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
  * @param mp		 Try to reclaim vnodes from this mountpoint
  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
  * 			 entries if this argument is strue
  * @param trigger	 Only reclaim vnodes with fewer than this many resident
  *			 pages.
  * @return		 The number of vnodes that were reclaimed.
  */
 static int
 vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
 {
 	struct vnode *vp;
 	int count, done, target;
 
 	done = 0;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	count = mp->mnt_nvnodelistsize;
 	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
 	target = target / 10 + 1;
 	while (count != 0 && done < target) {
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
 		/*
 		 * XXX LRU is completely broken for non-free vnodes.  First
 		 * by calling here in mountpoint order, then by moving
 		 * unselected vnodes to the end here, and most grossly by
 		 * removing the vlruvp() function that was supposed to
 		 * maintain the order.  (This function was born broken
 		 * since syncer problems prevented it doing anything.)  The
 		 * order is closer to LRC (C = Created).
 		 *
 		 * LRU reclaiming of vnodes seems to have last worked in
 		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
 		 * Then there was no hold count, and inactive vnodes were
 		 * simply put on the free list in LRU order.  The separate
 		 * lists also break LRU.  We prefer to reclaim from the
 		 * free list for technical reasons.  This tends to thrash
 		 * the free list to keep very unrecently used held vnodes.
 		 * The problem is mitigated by keeping the free list large.
 		 */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
 		if (!VI_TRYLOCK(vp))
 			goto next_iter;
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
 		 * Also skip free vnodes.  We are trying to make space
 		 * to expand the free list, not reduce it.
 		 */
 		if (vp->v_usecount ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 		    ((vp->v_iflag & VI_FREE) != 0) ||
 		    VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 		MNT_IUNLOCK(mp);
 		vholdl(vp);
 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
 			vdrop(vp);
 			goto next_iter_mntunlocked;
 		}
 		VI_LOCK(vp);
 		/*
 		 * v_usecount may have been bumped after VOP_LOCK() dropped
 		 * the vnode interlock and before it was locked again.
 		 *
 		 * It is not necessary to recheck VIRF_DOOMED because it can
 		 * only be set by another thread that holds both the vnode
 		 * lock and vnode interlock.  If another thread has the
 		 * vnode lock before we get to VOP_LOCK() and obtains the
 		 * vnode interlock after VOP_LOCK() drops the vnode
 		 * interlock, the other thread will be unable to drop the
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount ||
 		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
 		    (vp->v_iflag & VI_FREE) != 0 ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, 0);
 			vdropl(vp);
 			goto next_iter_mntunlocked;
 		}
 		KASSERT(!VN_IS_DOOMED(vp),
 		    ("VIRF_DOOMED unexpectedly detected in vlrureclaim()"));
 		counter_u64_add(recycles_count, 1);
 		vgonel(vp);
 		VOP_UNLOCK(vp, 0);
 		vdropl(vp);
 		done++;
 next_iter_mntunlocked:
 		if (!should_yield())
 			goto relock_mnt;
 		goto yield;
 next_iter:
 		if (!should_yield())
 			continue;
 		MNT_IUNLOCK(mp);
 yield:
 		kern_yield(PRI_USER);
 relock_mnt:
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	return done;
 }
 
 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
     0,
     "limit on vnode free requests per call to the vnlru_free routine");
 
 /*
  * Attempt to reduce the free list by the requested amount.
  */
 static void
 vnlru_free_locked(int count, struct vfsops *mnt_op)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	bool tried_batches;
 
 	tried_batches = false;
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
 	for (; count > 0; count--) {
 		vp = TAILQ_FIRST(&vnode_free_list);
 		/*
 		 * The list can be modified while the free_list_mtx
 		 * has been dropped and vp could be NULL here.
 		 */
 		if (vp == NULL) {
 			if (tried_batches)
 				break;
 			mtx_unlock(&vnode_free_list_mtx);
 			vnlru_return_batches(mnt_op);
 			tried_batches = true;
 			mtx_lock(&vnode_free_list_mtx);
 			continue;
 		}
 
 		VNASSERT(vp->v_op != NULL, vp,
 		    ("vnlru_free: vnode already reclaimed."));
 		KASSERT((vp->v_iflag & VI_FREE) != 0,
 		    ("Removing vnode not on freelist"));
 		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 		    ("Mangling active vnode"));
 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 
 		/*
 		 * Don't recycle if our vnode is from different type
 		 * of mount point.  Note that mp is type-safe, the
 		 * check does not reach unmapped address even if
 		 * vnode is reclaimed.
 		 * Don't recycle if we can't get the interlock without
 		 * blocking.
 		 */
 		if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
 		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
 			continue;
 		}
 		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
 		    vp, ("vp inconsistent on freelist"));
 
 		/*
 		 * The clear of VI_FREE prevents activation of the
 		 * vnode.  There is no sense in putting the vnode on
 		 * the mount point active list, only to remove it
 		 * later during recycling.  Inline the relevant part
 		 * of vholdl(), to avoid triggering assertions or
 		 * activating.
 		 */
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
 		VNODE_REFCOUNT_FENCE_REL();
 		refcount_acquire(&vp->v_holdcnt);
 
 		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
 		vtryrecycle(vp);
 		/*
 		 * If the recycled succeeded this vdrop will actually free
 		 * the vnode.  If not it will simply place it back on
 		 * the free list.
 		 */
 		vdrop(vp);
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
 
 void
 vnlru_free(int count, struct vfsops *mnt_op)
 {
 
 	mtx_lock(&vnode_free_list_mtx);
 	vnlru_free_locked(count, mnt_op);
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 
 /* XXX some names and initialization are bad for limits and watermarks. */
 static int
 vspace(void)
 {
 	int space;
 
 	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
 	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
 	vlowat = vhiwat / 2;
 	if (numvnodes > desiredvnodes)
 		return (0);
 	space = desiredvnodes - numvnodes;
 	if (freevnodes > wantfreevnodes)
 		space += freevnodes - wantfreevnodes;
 	return (space);
 }
 
 static void
 vnlru_return_batch_locked(struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 
 	if (mp->mnt_tmpfreevnodelistsize == 0)
 		return;
 
 	TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
 		VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
 		    ("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
 		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
 	}
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
 	freevnodes += mp->mnt_tmpfreevnodelistsize;
 	mtx_unlock(&vnode_free_list_mtx);
 	mp->mnt_tmpfreevnodelistsize = 0;
 }
 
 static void
 vnlru_return_batch(struct mount *mp)
 {
 
 	mtx_lock(&mp->mnt_listmtx);
 	vnlru_return_batch_locked(mp);
 	mtx_unlock(&mp->mnt_listmtx);
 }
 
 static void
 vnlru_return_batches(struct vfsops *mnt_op)
 {
 	struct mount *mp, *nmp;
 	bool need_unbusy;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		need_unbusy = false;
 		if (mnt_op != NULL && mp->mnt_op != mnt_op)
 			goto next;
 		if (mp->mnt_tmpfreevnodelistsize == 0)
 			goto next;
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
 			vnlru_return_batch(mp);
 			need_unbusy = true;
 			mtx_lock(&mountlist_mtx);
 		}
 next:
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		if (need_unbusy)
 			vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrurecycle() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	unsigned long onumvnodes;
 	int done, force, trigger, usevnodes, vsp;
 	bool reclaim_nc_src;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
 	force = 0;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_free_list_mtx);
 		/*
 		 * If numvnodes is too large (due to desiredvnodes being
 		 * adjusted using its sysctl, or emergency growth), first
 		 * try to reduce it by discarding from the free list.
 		 */
 		if (numvnodes > desiredvnodes)
 			vnlru_free_locked(numvnodes - desiredvnodes, NULL);
 		/*
 		 * Sleep if the vnode cache is in a good state.  This is
 		 * when it is not over-full and has space for about a 4%
 		 * or 9% expansion (by growing its size or inexcessively
 		 * reducing its free list).  Otherwise, try to reclaim
 		 * space for a 10% expansion.
 		 */
 		if (vstir && force == 0) {
 			force = 1;
 			vstir = 0;
 		}
 		vsp = vspace();
 		if (vsp >= vlowat && force == 0) {
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		onumvnodes = numvnodes;
 		/*
 		 * Calculate parameters for recycling.  These are the same
 		 * throughout the loop to give some semblance of fairness.
 		 * The trigger point is to avoid recycling vnodes with lots
 		 * of resident pages.  We aren't trying to free memory; we
 		 * are trying to recycle or at least free vnodes.
 		 */
 		if (numvnodes <= desiredvnodes)
 			usevnodes = numvnodes - freevnodes;
 		else
 			usevnodes = numvnodes;
 		if (usevnodes <= 0)
 			usevnodes = 1;
 		/*
 		 * The trigger value is is chosen to give a conservatively
 		 * large value to ensure that it alone doesn't prevent
 		 * making progress.  The value can easily be so large that
 		 * it is effectively infinite in some congested and
 		 * misconfigured cases, and this is necessary.  Normally
 		 * it is about 8 to 100 (pages), which is quite large.
 		 */
 		trigger = vm_cnt.v_page_count * 2 / usevnodes;
 		if (force < 2)
 			trigger = vsmalltrigger;
 		reclaim_nc_src = force >= 3;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			done += vlrureclaim(mp, reclaim_nc_src, trigger);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
 			uma_reclaim(UMA_RECLAIM_DRAIN);
 		if (done == 0) {
 			if (force == 0 || force == 1) {
 				force = 2;
 				continue;
 			}
 			if (force == 2) {
 				force = 3;
 				continue;
 			}
 			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
 			kern_yield(PRI_USER);
 		/*
 		 * After becoming active to expand above low water, keep
 		 * active until above high water.
 		 */
 		vsp = vspace();
 		force = vsp < vhiwat;
 	}
 }
 
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
     &vnlru_kp);
  
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
  * before we actually vgone().  This function must be called with the vnode
  * held to prevent the vnode from being returned to the free list midway
  * through vgone().
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct mount *vnmp;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
 	/*
 	 * This vnode may found and locked via some other list, if so we
 	 * can't recycle it yet.
 	 */
 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, vp %p lock is already held",
 		    __func__, vp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0);
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, cannot start the write for %p",
 		    __func__, vp);
 		return (EBUSY);
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with DOOMED via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount) {
 		VOP_UNLOCK(vp, 0);
 		VI_UNLOCK(vp);
 		vn_finished_write(vnmp);
 		CTR2(KTR_VFS,
 		    "%s: impossible to recycle, %p is already referenced",
 		    __func__, vp);
 		return (EBUSY);
 	}
 	if (!VN_IS_DOOMED(vp)) {
 		counter_u64_add(recycles_count, 1);
 		vgonel(vp);
 	}
 	VOP_UNLOCK(vp, 0);
 	VI_UNLOCK(vp);
 	vn_finished_write(vnmp);
 	return (0);
 }
 
 static void
 vcheckspace(void)
 {
 	int vsp;
 
 	vsp = vspace();
 	if (vsp < vlowat && vnlruproc_sig == 0) {
 		vnlruproc_sig = 1;
 		wakeup(vnlruproc);
 	}
 }
 
 /*
  * Wait if necessary for space for a new vnode.
  */
 static int
 getnewvnode_wait(int suspended)
 {
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	if (numvnodes >= desiredvnodes) {
 		if (suspended) {
 			/*
 			 * The file system is being suspended.  We cannot
 			 * risk a deadlock here, so allow allocation of
 			 * another vnode even if this would give too many.
 			 */
 			return (0);
 		}
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 	}
 	/* Post-adjust like the pre-adjust in getnewvnode(). */
 	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
 		vnlru_free_locked(1, NULL);
 	return (numvnodes >= desiredvnodes ? ENFILE : 0);
 }
 
 /*
  * This hack is fragile, and probably not needed any more now that the
  * watermark handling works.
  */
 void
 getnewvnode_reserve(u_int count)
 {
 	struct thread *td;
 
 	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
 	/* XXX no longer so quick, but this part is not racy. */
 	mtx_lock(&vnode_free_list_mtx);
 	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
 		vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
 		    freevnodes - wantfreevnodes), NULL);
 	mtx_unlock(&vnode_free_list_mtx);
 
 	td = curthread;
 	/* First try to be quick and racy. */
 	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
 		td->td_vp_reserv += count;
 		vcheckspace();	/* XXX no longer so quick, but more racy */
 		return;
 	} else
 		atomic_subtract_long(&numvnodes, count);
 
 	mtx_lock(&vnode_free_list_mtx);
 	while (count > 0) {
 		if (getnewvnode_wait(0) == 0) {
 			count--;
 			td->td_vp_reserv++;
 			atomic_add_long(&numvnodes, 1);
 		}
 	}
 	vcheckspace();
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * This hack is fragile, especially if desiredvnodes or wantvnodes are
  * misconfgured or changed significantly.  Reducing desiredvnodes below
  * the reserved amount should cause bizarre behaviour like reducing it
  * below the number of active vnodes -- the system will try to reduce
  * numvnodes to match, but should fail, so the subtraction below should
  * not overflow.
  */
 void
 getnewvnode_drop_reserve(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
 	td->td_vp_reserv = 0;
 }
 
 /*
  * Return the next vnode from the free list.
  */
 int
 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp;
 	struct thread *td;
 	struct lock_object *lo;
 	static int cyclecount;
 	int error __unused;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+
+	KASSERT(vops->registered,
+	    ("%s: not registered vector op %p\n", __func__, vops));
+
 	vp = NULL;
 	td = curthread;
 	if (td->td_vp_reserv > 0) {
 		td->td_vp_reserv -= 1;
 		goto alloc;
 	}
 	mtx_lock(&vnode_free_list_mtx);
 	if (numvnodes < desiredvnodes)
 		cyclecount = 0;
 	else if (cyclecount++ >= freevnodes) {
 		cyclecount = 0;
 		vstir = 1;
 	}
 	/*
 	 * Grow the vnode cache if it will not be above its target max
 	 * after growing.  Otherwise, if the free list is nonempty, try
 	 * to reclaim 1 item from it before growing the cache (possibly
 	 * above its target max if the reclamation failed or is delayed).
 	 * Otherwise, wait for some space.  In all cases, schedule
 	 * vnlru_proc() if we are getting short of space.  The watermarks
 	 * should be chosen so that we never wait or even reclaim from
 	 * the free list to below its target minimum.
 	 */
 	if (numvnodes + 1 <= desiredvnodes)
 		;
 	else if (freevnodes > 0)
 		vnlru_free_locked(1, NULL);
 	else {
 		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
 		    MNTK_SUSPEND));
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 		if (error != 0) {
 			mtx_unlock(&vnode_free_list_mtx);
 			return (error);
 		}
 #endif
 	}
 	vcheckspace();
 	atomic_add_long(&numvnodes, 1);
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:
 	counter_u64_add(vnodes_created, 1);
 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	/*
 	 * Locks are given the generic name "vnode" when created.
 	 * Follow the historic practice of using the filesystem
 	 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
 	 *
 	 * Locks live in a witness group keyed on their name. Thus,
 	 * when a lock is renamed, it must also move from the witness
 	 * group of its old name to the witness group of its new name.
 	 *
 	 * The change only needs to be made when the vnode moves
 	 * from one filesystem type to another. We ensure that each
 	 * filesystem use a single static name pointer for its tag so
 	 * that we can compare pointers rather than doing a strcmp().
 	 */
 	lo = &vp->v_vnlock->lock_object;
 	if (lo->lo_name != tag) {
 		lo->lo_name = tag;
 		WITNESS_DESTROY(lo);
 		WITNESS_INIT(lo, tag);
 	}
 	/*
 	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
 	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
 	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
 	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_init_counters(vp);
 	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef DIAGNOSTIC
 	if (mp == NULL && vops != &dead_vnodeops)
 		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
 #endif
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_vnode_associate_singlelabel(mp, vp);
 #endif
 	if (mp != NULL) {
 		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
 	 * still initialize v_hash to have vfs_hash_index() useful.
 	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
 	 * its own hashing.
 	 */
 	vp->v_hash = (uintptr_t)vp >> vnsz2log;
 
 	*vpp = vp;
 	return (0);
 }
 
 static void
 freevnode(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	/*
 	 * The vnode has been marked for destruction, so free it.
 	 *
 	 * The vnode will be returned to the zone where it will
 	 * normally remain until it is needed for another vnode. We
 	 * need to cleanup (or verify that the cleanup has already
 	 * been done) any residual data left from its current use
 	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	atomic_subtract_long(&numvnodes, 1);
 	bo = &vp->v_bufobj;
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("cleaned vnode still on the free list."));
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
 	    ("clean blk trie not empty"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
 	    ("dirty blk trie not empty"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
 	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
 	    ("Dangling rangelock waiters"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
 	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
 		vp->v_pollinfo = NULL;
 	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
 	vp->v_mountedhere = NULL;
 	vp->v_unpcb = NULL;
 	vp->v_rdev = NULL;
 	vp->v_fifoinfo = NULL;
 	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 	vp->v_irflag = 0;
 	vp->v_iflag = 0;
 	vp->v_vflag = 0;
 	bo->bo_flag = 0;
 	uma_zfree(vnode_zone, vp);
 }
 
 /*
  * Delete from old mount point vnode list, if on one.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *mp;
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
 	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
 	    ("Active vnode list size %d > Vnode list size %d",
 	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
 	if (vp->v_iflag & VI_ACTIVE) {
 		vp->v_iflag &= ~VI_ACTIVE;
 		mtx_lock(&mp->mnt_listmtx);
 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
 		mp->mnt_activevnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
 	vp->v_mount = NULL;
 	VI_UNLOCK(vp);
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
 		("bad mount point vnode list size"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	mp->mnt_nvnodelistsize--;
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
 
 static void
 insmntque_stddtr(struct vnode *vp, void *dtr_arg)
 {
 
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Insert into list of vnodes for the new mount point, if available.
  */
 int
 insmntque1(struct vnode *vp, struct mount *mp,
 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
 
 	/*
 	 * We acquire the vnode interlock early to ensure that the
 	 * vnode cannot be recycled by another process releasing a
 	 * holdcnt on it before we get it on both the vnode list
 	 * and the active vnode list. The mount mutex protects only
 	 * manipulation of the vnode list and the vnode freelist
 	 * mutex protects only manipulation of the active vnode list.
 	 * Hence the need to hold the vnode interlock throughout.
 	 */
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
 	if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
 	    mp->mnt_nvnodelistsize == 0)) &&
 	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
 		VI_UNLOCK(vp);
 		MNT_IUNLOCK(mp);
 		if (dtr != NULL)
 			dtr(vp, dtr_arg);
 		return (EBUSY);
 	}
 	vp->v_mount = mp;
 	MNT_REF(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag |= VI_ACTIVE;
 	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
 	mp->mnt_activevnodelistsize++;
 	mtx_unlock(&mp->mnt_listmtx);
 	VI_UNLOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (0);
 }
 
 int
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
 			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0 && !(flags & V_CLEANONLY))
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		bufobj_wwait(bo, 0, 0);
 		if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
 			BO_UNLOCK(bo);
 			vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
 			BO_LOCK(bo);
 		}
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL &&
 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
 		VM_OBJECT_WLOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
 		VM_OBJECT_WUNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
 	    V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
 	    bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
 	    bo->bo_dirty.bv_cnt > 0)
 		panic("vinvalbuf: flush dirty failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
 {
 
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 	if (vp->v_object != NULL && vp->v_object->handle != vp)
 		return (0);
 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  */
 static int
 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		/*
 		 * If we are flushing both V_NORMAL and V_ALT buffers then
 		 * do not skip any buffers. If we are flushing only V_NORMAL
 		 * buffers then skip buffers marked as BX_ALTDATA. If we are
 		 * flushing only V_ALT buffers then skip buffers not marked
 		 * as BX_ALTDATA.
 		 */
 		if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
 		   (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
 			continue;
 		}
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
 			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp == NULL)
 			break;
 		nbp = gbincore(bo, lblkno);
 		if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		    != xflags)
 			break;			/* nbp invalid */
 	}
 	return (retval);
 }
 
 int
 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
 {
 	struct buf *bp;
 	int error;
 	daddr_t lblkno;
 
 	ASSERT_BO_LOCKED(bo);
 
 	for (lblkno = startn;;) {
 again:
 		bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
 		if (bp == NULL || bp->b_lblkno >= endn ||
 		    bp->b_lblkno < startn)
 			break;
 		error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
 		    LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
 		if (error != 0) {
 			BO_RLOCK(bo);
 			if (error == ENOLCK)
 				goto again;
 			return (error);
 		}
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		lblkno = bp->b_lblkno + 1;
 		if ((bp->b_flags & B_MANAGED) == 0)
 			bremfree(bp);
 		bp->b_flags |= B_RELBUF;
 		/*
 		 * In the VMIO case, use the B_NOREUSE flag to hint that the
 		 * pages backing each buffer in the range are unlikely to be
 		 * reused.  Dirty buffers will have the hint applied once
 		 * they've been written.
 		 */
 		if ((bp->b_flags & B_VMIO) != 0)
 			bp->b_flags |= B_NOREUSE;
 		brelse(bp);
 		BO_RLOCK(bo);
 	}
 	return (0);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  */
 int
 vtruncbuf(struct vnode *vp, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	daddr_t startlbn;
 
 	CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
 	    vp, blksize, (uintmax_t)length);
 
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	startlbn = howmany(length, blksize);
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 
 	bo = &vp->v_bufobj;
 restart_unlocked:
 	BO_LOCK(bo);
 
 	while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
 		;
 
 	if (length > 0) {
 restartsync:
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK)
 				goto restart_unlocked;
 
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			BO_LOCK(bo);
 			goto restartsync;
 		}
 	}
 
 	bufobj_wwait(bo, 0, 0);
 	BO_UNLOCK(bo);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * Invalidate the cached pages of a file's buffer within the range of block
  * numbers [startlbn, endlbn).
  */
 void
 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
     int blksize)
 {
 	struct bufobj *bo;
 	off_t start, end;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
 
 	start = blksize * startlbn;
 	end = blksize * endlbn;
 
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	MPASS(blksize == bo->bo_bsize);
 
 	while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
 		;
 
 	BO_UNLOCK(bo);
 	vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
 }
 
 static int
 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
     daddr_t startlbn, daddr_t endlbn)
 {
 	struct buf *bp, *nbp;
 	bool anyfreed;
 
 	ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
 	ASSERT_BO_LOCKED(bo);
 
 	do {
 		anyfreed = false;
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    nbp->b_vp != vp ||
 			    (nbp->b_flags & B_DELWRI) != 0))
 				return (EAGAIN);
 		}
 
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) == ENOLCK) {
 				BO_LOCK(bo);
 				return (EAGAIN);
 			}
 			bremfree(bp);
 			bp->b_flags |= B_INVAL | B_RELBUF;
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = true;
 
 			BO_LOCK(bo);
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0))
 				return (EAGAIN);
 		}
 	} while (anyfreed);
 	return (0);
 }
 
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_WLOCKED(bp->b_bufobj);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 	    (BX_VNDIRTY|BX_VNCLEAN),
 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
 	if (bp->b_xflags & BX_VNDIRTY)
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
 	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct bufv *bv;
 	struct buf *n;
 	int error;
 
 	ASSERT_BO_WLOCKED(bo);
 	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
 	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY)
 		bv = &bo->bo_dirty;
 	else
 		bv = &bo->bo_clean;
 
 	/*
 	 * Keep the list ordered.  Optimize empty list insertion.  Assume
 	 * we tend to grow at the tail so lookup_le should usually be cheaper
 	 * than _ge. 
 	 */
 	if (bv->bv_cnt == 0 ||
 	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
 		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
 	else
 		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
 	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
 	if (error)
 		panic("buf_vlist_add:  Preallocated nodes insufficient.");
 	bv->bv_cnt++;
 }
 
 /*
  * Look up a buffer using the buffer tries.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *bp;
 
 	ASSERT_BO_LOCKED(bo);
 	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
 	if (bp != NULL)
 		return (bp);
 	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
 }
 
 /*
  * Associate a buffer with a vnode.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	ASSERT_BO_WLOCKED(bo);
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	vhold(vp);
 	bp->b_vp = vp;
 	bp->b_bufobj = bo;
 	/*
 	 * Insert onto list for new vnode.
 	 */
 	buf_vlist_add(bp, bo, BX_VNCLEAN);
 }
 
 /*
  * Disassociate a buffer from a vnode.
  */
 void
 brelvp(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("brelvp: Buffer %p not on queue.", bp);
 	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	BO_UNLOCK(bo);
 	vdrop(vp);
 }
 
 /*
  * Add an item to the syncer work queue.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int slot;
 
 	ASSERT_BO_WLOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if (bo->bo_flag & BO_ONWORKLST)
 		LIST_REMOVE(bo, bo_synclist);
 	else {
 		bo->bo_flag |= BO_ONWORKLST;
 		syncer_worklist_len++;
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int error, len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 	error = SYSCTL_OUT(req, &len, sizeof(len));
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
 
 static int
 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	*bo = LIST_FIRST(slp);
 	if (*bo == NULL)
 		return (0);
 	vp = bo2vnode(*bo);
 	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
 		mtx_lock(&sync_mtx);
 		return (*bo == LIST_FIRST(slp));
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 	BO_LOCK(*bo);
 	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
 	BO_UNLOCK(*bo);
 	vdrop(vp);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 static int first_printf = 1;
 
 /*
  * System filesystem synchronizer daemon.
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next, *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	mtx_lock(&sync_mtx);
 	for (;;) {
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kproc_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining... ");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since the
 			 * it was emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		while (!LIST_EMPTY(slp)) {
 			error = sync_vnode(slp, &bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 
 			if (first_printf == 0) {
 				/*
 				 * Drop the sync mutex, because some watchdog
 				 * drivers need to sleep while patting
 				 */
 				mtx_unlock(&sync_mtx);
 				wdog_kern_pat(WD_LASTVAL);
 				mtx_lock(&sync_mtx);
 			}
 
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		if (rushjob > 0) {
 			rushjob -= 1;
 			continue;
 		}
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING ||
 		    time_uptime == starttime) {
 			thread_lock(td);
 			sched_prio(td, PPAUSE);
 			thread_unlock(td);
 		}
 		if (syncer_state != SYNCER_RUNNING)
 			cv_timedwait(&sync_wakeup, &sync_mtx,
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
 			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
 	}
 }
 
 /*
  * Request the syncer daemon to speed up its work.
  * We never push it to speed up more than half of its
  * normal turn time, otherwise it could take over the cpu.
  */
 int
 speedup_syncer(void)
 {
 	int ret = 0;
 
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob += 1;
 		stat_rush_requests += 1;
 		ret = 1;
 	}
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	return (ret);
 }
 
 /*
  * Tell the syncer to speed up its work and run though its work
  * list several times, then tell it to shut down.
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 
 	if (howto & RB_NOSYNC)
 		return;
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	kproc_shutdown(arg, howto);
 }
 
 void
 syncer_suspend(void)
 {
 
 	syncer_shutdown(updateproc, 0);
 }
 
 void
 syncer_resume(void)
 {
 
 	mtx_lock(&sync_mtx);
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	mtx_unlock(&sync_mtx);
 	cv_broadcast(&sync_wakeup);
 	kproc_resume(updateproc);
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 #ifdef INVARIANTS
 	struct bufv *bv;
 #endif
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	++reassignbufcalls;
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	BO_LOCK(bo);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("reassignbuf: Buffer %p not on queue.", bp);
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
 			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 #ifdef INVARIANTS
 	bv = &bo->bo_clean;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bv = &bo->bo_dirty;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	BO_UNLOCK(bo);
 }
 
 static void
 v_init_counters(struct vnode *vp)
 {
 
 	VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
 	    vp, ("%s called for an initialized vnode", __FUNCTION__));
 	ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
 
 	refcount_init(&vp->v_holdcnt, 1);
 	refcount_init(&vp->v_usecount, 1);
 }
 
 /*
  * Increment si_usecount of the associated device, if any.
  */
 static void
 v_incr_devcount(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount++;
 		dev_unlock();
 	}
 }
 
 /*
  * Decrement si_usecount of the associated device, if any.
  */
 static void
 v_decr_devcount(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount--;
 		dev_unlock();
 	}
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it.  VIRF_DOOMED is set if the vnode
  * is being destroyed.  Only callers who specify LK_RETRY will
  * see doomed vnodes.  If inactive processing was delayed in
  * vput try to do it here.
  *
  * Both holdcnt and usecount can be manipulated using atomics without holding
  * any locks except in these cases which require the vnode interlock:
  * holdcnt: 1->0 and 0->1
  * usecount: 0->1
  *
  * usecount is permitted to transition 1->0 without the interlock because
  * vnode is kept live by holdcnt.
  */
 static enum vgetstate __always_inline
 _vget_prep(struct vnode *vp, bool interlock)
 {
 	enum vgetstate vs;
 
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 		vs = VGET_USECOUNT;
 	} else {
 		if (interlock)
 			vholdl(vp);
 		else
 			vhold(vp);
 		vs = VGET_HOLDCNT;
 	}
 	return (vs);
 }
 
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
 
 	return (_vget_prep(vp, false));
 }
 
 int
 vget(struct vnode *vp, int flags, struct thread *td)
 {
 	enum vgetstate vs;
 
 	MPASS(td == curthread);
 
 	vs = _vget_prep(vp, (flags & LK_INTERLOCK) != 0);
 	return (vget_finish(vp, flags, vs));
 }
 
 int
 vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
 {
 	int error, oweinact;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("%s: invalid lock operation", __func__));
 
 	if ((flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
 	if (vs == VGET_USECOUNT) {
 		VNASSERT(vp->v_usecount > 0, vp,
 		    ("%s: vnode without usecount when VGET_USECOUNT was passed",
 		    __func__));
 	}
 
 	if ((error = vn_lock(vp, flags)) != 0) {
 		if (vs == VGET_USECOUNT)
 			vrele(vp);
 		else
 			vdrop(vp);
 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 		    vp);
 		return (error);
 	}
 
 	if (vs == VGET_USECOUNT) {
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
 		return (0);
 	}
 
 	/*
 	 * We hold the vnode. If the usecount is 0 it will be utilized to keep
 	 * the vnode around. Otherwise someone else lended their hold count and
 	 * we have to drop ours.
 	 */
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 #ifdef INVARIANTS
 		int old = atomic_fetchadd_int(&vp->v_holdcnt, -1) - 1;
 		VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
 #else
 		refcount_release(&vp->v_holdcnt);
 #endif
 		VNODE_REFCOUNT_FENCE_ACQ();
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
 		return (0);
 	}
 
 	/*
 	 * We don't guarantee that any particular close will
 	 * trigger inactive processing so just make a best effort
 	 * here at preventing a reference to a removed file.  If
 	 * we don't succeed no harm is done.
 	 *
 	 * Upgrade our holdcnt to a usecount.
 	 */
 	VI_LOCK(vp);
 	/*
 	 * See the previous section. By the time we get here we may find
 	 * ourselves in the same spot.
 	 */
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 #ifdef INVARIANTS
 		int old = atomic_fetchadd_int(&vp->v_holdcnt, -1) - 1;
 		VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
 #else
 		refcount_release(&vp->v_holdcnt);
 #endif
 		VNODE_REFCOUNT_FENCE_ACQ();
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 		    ("%s: vnode with usecount and VI_OWEINACT set",
 		    __func__));
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	if ((vp->v_iflag & VI_OWEINACT) == 0) {
 		oweinact = 0;
 	} else {
 		oweinact = 1;
 		vp->v_iflag &= ~VI_OWEINACT;
 		VNODE_REFCOUNT_FENCE_REL();
 	}
 	v_incr_devcount(vp);
 	refcount_acquire(&vp->v_usecount);
 	if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
 	    (flags & LK_NOWAIT) == 0)
 		vinactive(vp, curthread);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Increase the reference (use) and hold count of a vnode.
  * This will also remove the vnode from the free list if it is presently free.
  */
 void
 vref(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 		VNODE_REFCOUNT_FENCE_ACQ();
 		VNASSERT(vp->v_holdcnt > 0, vp,
 		    ("%s: active vnode not held", __func__));
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
 		return;
 	}
 	VI_LOCK(vp);
 	vrefl(vp);
 	VI_UNLOCK(vp);
 }
 
 void
 vrefl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
 		VNODE_REFCOUNT_FENCE_ACQ();
 		VNASSERT(vp->v_holdcnt > 0, vp,
 		    ("%s: active vnode not held", __func__));
 		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
 		    ("%s: vnode with usecount and VI_OWEINACT set", __func__));
 		return;
 	}
 	vholdl(vp);
 	if ((vp->v_iflag & VI_OWEINACT) != 0) {
 		vp->v_iflag &= ~VI_OWEINACT;
 		VNODE_REFCOUNT_FENCE_REL();
 	}
 	v_incr_devcount(vp);
 	refcount_acquire(&vp->v_usecount);
 }
 
 void
 vrefact(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_usecount, 1);
 	VNASSERT(old > 0, vp, ("%s: wrong use count", __func__));
 #else
 	refcount_acquire(&vp->v_usecount);
 #endif
 }
 
 /*
  * Return reference count of a vnode.
  *
  * The results of this call are only guaranteed when some mechanism is used to
  * stop other processes from gaining references to the vnode.  This may be the
  * case if the caller holds the only reference.  This is also useful when stale
  * data is acceptable as race conditions may be accounted for by some other
  * means.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 
 	return (vp->v_usecount);
 }
 
 enum vputx_op { VPUTX_VRELE, VPUTX_VPUT, VPUTX_VUNREF };
 
 /*
  * Decrement the use and hold counts for a vnode.
  *
  * See an explanation near vget() as to why atomic operation is safe.
  */
 static void
 vputx(struct vnode *vp, enum vputx_op func)
 {
 	int error;
 
 	KASSERT(vp != NULL, ("vputx: null vp"));
 	if (func == VPUTX_VUNREF)
 		ASSERT_VOP_LOCKED(vp, "vunref");
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
 	    ("%s: wrong ref counts", __func__));
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 
 	/*
 	 * We want to hold the vnode until the inactive finishes to
 	 * prevent vgone() races.  We drop the use count here and the
 	 * hold count below when we're done.
 	 *
 	 * If we release the last usecount we take ownership of the hold
 	 * count which provides liveness of the vnode, in which case we
 	 * have to vdrop.
 	 */
 	if (!refcount_release(&vp->v_usecount))
 		return;
 	VI_LOCK(vp);
 	v_decr_devcount(vp);
 	/*
 	 * By the time we got here someone else might have transitioned
 	 * the count back to > 0.
 	 */
 	if (vp->v_usecount > 0) {
 		vdropl(vp);
 		return;
 	}
 	if (vp->v_iflag & VI_DOINGINACT) {
 		vdropl(vp);
 		return;
 	}
 
 	/*
 	 * Check if the fs wants to perform inactive processing. Note we
 	 * may be only holding the interlock, in which case it is possible
 	 * someone else called vgone on the vnode and ->v_data is now NULL.
 	 * Since vgone performs inactive on its own there is nothing to do
 	 * here but to drop our hold count.
 	 */
 	if (__predict_false(VN_IS_DOOMED(vp)) ||
 	    VOP_NEED_INACTIVE(vp) == 0) {
 		vdropl(vp);
 		return;
 	}
 
 	/*
 	 * We must call VOP_INACTIVE with the node locked. Mark
 	 * as VI_DOINGINACT to avoid recursion.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
 	switch (func) {
 	case VPUTX_VRELE:
 		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
 		VI_LOCK(vp);
 		break;
 	case VPUTX_VPUT:
 		error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT);
 		VI_LOCK(vp);
 		break;
 	case VPUTX_VUNREF:
 		error = 0;
 		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
 			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
 			VI_LOCK(vp);
 		}
 		break;
 	}
 	VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
 	    ("vnode with usecount and VI_OWEINACT set"));
 	if (error == 0) {
 		if (vp->v_iflag & VI_OWEINACT)
 			vinactive(vp, curthread);
 		if (func != VPUTX_VUNREF)
 			VOP_UNLOCK(vp, 0);
 	}
 	vdropl(vp);
 }
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  */
 void
 vrele(struct vnode *vp)
 {
 
 	vputx(vp, VPUTX_VRELE);
 }
 
 /*
  * Release an already locked vnode.  This give the same effects as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-aquiring the lock (as vrele() acquires the lock internally.)
  *
  * It is an invariant that all VOP_* calls operate on a held vnode.
  * We may be only having an implicit hold stemming from our usecount,
  * which we are about to release. If we unlock the vnode afterwards we
  * open a time window where someone else dropped the last usecount and
  * proceeded to free the vnode before our unlock finished. For this
  * reason we unlock the vnode early. This is a little bit wasteful as
  * it may be the vnode is exclusively locked and inactive processing is
  * needed, in which case we are adding work.
  */
 void
 vput(struct vnode *vp)
 {
 
 	VOP_UNLOCK(vp, 0);
 	vputx(vp, VPUTX_VPUT);
 }
 
 /*
  * Release an exclusively locked vnode. Do not unlock the vnode lock.
  */
 void
 vunref(struct vnode *vp)
 {
 
 	vputx(vp, VPUTX_VUNREF);
 }
 
 /*
  * Increase the hold count and activate if this is the first reference.
  */
 static void
 vhold_activate(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt == 0, vp,
 	    ("%s: wrong hold count", __func__));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("%s: vnode already reclaimed.", __func__));
 	/*
 	 * Remove a vnode from the free list, mark it as in use,
 	 * and put it on the active list.
 	 */
 	VNASSERT(vp->v_mount != NULL, vp,
 	    ("_vhold: vnode not on per mount vnode list"));
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
 	if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
 		TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
 		mp->mnt_tmpfreevnodelistsize--;
 		vp->v_mflag &= ~VMP_TMPMNTFREELIST;
 	} else {
 		mtx_lock(&vnode_free_list_mtx);
 		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 		freevnodes--;
 		mtx_unlock(&vnode_free_list_mtx);
 	}
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag &= ~VI_FREE;
 	vp->v_iflag |= VI_ACTIVE;
 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
 	mp->mnt_activevnodelistsize++;
 	mtx_unlock(&mp->mnt_listmtx);
 	refcount_acquire(&vp->v_holdcnt);
 }
 
 void
 vhold(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
 		VNODE_REFCOUNT_FENCE_ACQ();
 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 		    ("vhold: vnode with holdcnt is free"));
 		return;
 	}
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 void
 vholdl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if ((vp->v_iflag & VI_FREE) == 0) {
 		refcount_acquire(&vp->v_holdcnt);
 		return;
 	}
 	vhold_activate(vp);
 }
 
 void
 vholdnz(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
 	VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 
 /*
  * Drop the hold count of the vnode.  If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
  * (marked VIRF_DOOMED) in which case we will free it.
  *
  * Because the vnode vm object keeps a hold reference on the vnode if
  * there is at least one resident non-cached page, the vnode cannot
  * leave the active list without the page cleanup done.
  */
 static void
 vdrop_deactivate(struct vnode *vp)
 {
 	struct mount *mp;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	/*
 	 * Mark a vnode as free: remove it from its active list
 	 * and put it up for recycling on the freelist.
 	 */
 	VNASSERT(!VN_IS_DOOMED(vp), vp,
 	    ("vdrop: returning doomed vnode"));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("vdrop: vnode already reclaimed."));
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("vnode already free"));
 	VNASSERT(vp->v_holdcnt == 0, vp,
 	    ("vdrop: freeing when we shouldn't"));
 	if ((vp->v_iflag & VI_OWEINACT) == 0) {
 		mp = vp->v_mount;
 		if (mp != NULL) {
 			mtx_lock(&mp->mnt_listmtx);
 			if (vp->v_iflag & VI_ACTIVE) {
 				vp->v_iflag &= ~VI_ACTIVE;
 				TAILQ_REMOVE(&mp->mnt_activevnodelist,
 				    vp, v_actfreelist);
 				mp->mnt_activevnodelistsize--;
 			}
 			TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist,
 			    vp, v_actfreelist);
 			mp->mnt_tmpfreevnodelistsize++;
 			vp->v_iflag |= VI_FREE;
 			vp->v_mflag |= VMP_TMPMNTFREELIST;
 			VI_UNLOCK(vp);
 			if (mp->mnt_tmpfreevnodelistsize >=
 			    mnt_free_list_batch)
 				vnlru_return_batch_locked(mp);
 			mtx_unlock(&mp->mnt_listmtx);
 		} else {
 			VNASSERT((vp->v_iflag & VI_ACTIVE) == 0, vp,
 			    ("vdrop: active vnode not on per mount vnode list"));
 			mtx_lock(&vnode_free_list_mtx);
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
 			    v_actfreelist);
 			freevnodes++;
 			vp->v_iflag |= VI_FREE;
 			VI_UNLOCK(vp);
 			mtx_unlock(&vnode_free_list_mtx);
 		}
 	} else {
 		VI_UNLOCK(vp);
 		counter_u64_add(free_owe_inact, 1);
 	}
 }
 
 void
 vdrop(struct vnode *vp)
 {
 
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (__predict_false((int)vp->v_holdcnt <= 0)) {
 		vn_printf(vp, "vdrop: holdcnt %d", vp->v_holdcnt);
 		panic("vdrop: wrong holdcnt");
 	}
 	if (refcount_release_if_not_last(&vp->v_holdcnt))
 		return;
 	VI_LOCK(vp);
 	vdropl(vp);
 }
 
 void
 vdropl(struct vnode *vp)
 {
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (__predict_false((int)vp->v_holdcnt <= 0)) {
 		vn_printf(vp, "vdrop: holdcnt %d", vp->v_holdcnt);
 		panic("vdrop: wrong holdcnt");
 	}
 	if (!refcount_release(&vp->v_holdcnt)) {
 		VI_UNLOCK(vp);
 		return;
 	}
 	if (VN_IS_DOOMED(vp)) {
 		freevnode(vp);
 		return;
 	}
 	vdrop_deactivate(vp);
 }
 
 /*
  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
  * OWEINACT tracks whether a vnode missed a call to inactive due to a
  * failed lock upgrade.
  */
 void
 vinactive(struct vnode *vp, struct thread *td)
 {
 	struct vm_object *obj;
 
 	ASSERT_VOP_ELOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	vp->v_iflag |= VI_DOINGINACT;
 	vp->v_iflag &= ~VI_OWEINACT;
 	VI_UNLOCK(vp);
 	/*
 	 * Before moving off the active list, we must be sure that any
 	 * modified pages are converted into the vnode's dirty
 	 * buffers, since these will no longer be checked once the
 	 * vnode is on the inactive list.
 	 *
 	 * The write-out of the dirty pages is asynchronous.  At the
 	 * point that VOP_INACTIVE() is called, there could still be
 	 * pending I/O and dirty pages in the object.
 	 */
 	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 	    vm_object_mightbedirty(obj)) {
 		VM_OBJECT_WLOCK(obj);
 		vm_object_page_clean(obj, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	VOP_INACTIVE(vp, td);
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~VI_DOINGINACT;
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush(, td)
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  */
 #ifdef DIAGNOSTIC
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
 #endif
 
 int
 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
 	    rootrefs, flags);
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
 			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
 			    __func__, error);
 			return (error);
 		}
 		vput(rootvp);
 	}
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 		vholdl(vp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
 		if (error) {
 			vdrop(vp);
 			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 			goto loop;
 		}
 		/*
 		 * Skip over a vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0);
 			vdrop(vp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			if (vp->v_object != NULL) {
 				VM_OBJECT_WLOCK(vp->v_object);
 				vm_object_page_clean(vp->v_object, 0, 0, 0);
 				VM_OBJECT_WUNLOCK(vp->v_object);
 			}
 			error = VOP_FSYNC(vp, MNT_WAIT, td);
 			if (error != 0) {
 				VOP_UNLOCK(vp, 0);
 				vdrop(vp);
 				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
 				return (error);
 			}
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 			VI_LOCK(vp);
 
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount <= 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, 0);
 				vdropl(vp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 *
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 			vgonel(vp);
 		} else {
 			busy++;
 #ifdef DIAGNOSTIC
 			if (busyprt)
 				vn_printf(vp, "vflush: busy vnode ");
 #endif
 		}
 		VOP_UNLOCK(vp, 0);
 		vdropl(vp);
 	}
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp, 0);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy) {
 		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
 		    busy);
 		return (EBUSY);
 	}
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * Recycle an unused vnode to the front of the free list.
  */
 int
 vrecycle(struct vnode *vp)
 {
 	int recycled;
 
 	VI_LOCK(vp);
 	recycled = vrecyclel(vp);
 	VI_UNLOCK(vp);
 	return (recycled);
 }
 
 /*
  * vrecycle, with the vp interlock held.
  */
 int
 vrecyclel(struct vnode *vp)
 {
 	int recycled;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	ASSERT_VI_LOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	recycled = 0;
 	if (vp->v_usecount == 0) {
 		recycled = 1;
 		vgonel(vp);
 	}
 	return (recycled);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  */
 void
 vgone(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vgonel(vp);
 	VI_UNLOCK(vp);
 }
 
 static void
 notify_lowervp_vfs_dummy(struct mount *mp __unused,
     struct vnode *lowervp __unused)
 {
 }
 
 /*
  * Notify upper mounts about reclaimed or unlinked vnode.
  */
 void
 vfs_notify_upper(struct vnode *vp, int event)
 {
 	static struct vfsops vgonel_vfsops = {
 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
 	};
 	struct mount *mp, *ump, *mmp;
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		return;
 	if (TAILQ_EMPTY(&mp->mnt_uppers))
 		return;
 
 	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
 	mmp->mnt_op = &vgonel_vfsops;
 	mmp->mnt_kern_flag |= MNTK_MARKER;
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
 	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
 		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
 			ump = TAILQ_NEXT(ump, mnt_upper_link);
 			continue;
 		}
 		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
 		MNT_IUNLOCK(mp);
 		switch (event) {
 		case VFS_NOTIFY_UPPER_RECLAIM:
 			VFS_RECLAIM_LOWERVP(ump, vp);
 			break;
 		case VFS_NOTIFY_UPPER_UNLINK:
 			VFS_UNLINK_LOWERVP(ump, vp);
 			break;
 		default:
 			KASSERT(0, ("invalid event %d", event));
 			break;
 		}
 		MNT_ILOCK(mp);
 		ump = TAILQ_NEXT(mmp, mnt_upper_link);
 		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
 	}
 	free(mmp, M_TEMP);
 	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
 	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
 		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
 		wakeup(&mp->mnt_uppers);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * vgone, with the vp interlock held.
  */
 static void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
 	struct mount *mp;
 	vm_object_t object;
 	bool active, oweinact;
 
 	ASSERT_VOP_ELOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	td = curthread;
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	if (vp->v_irflag & VIRF_DOOMED)
 		return;
 	vp->v_irflag |= VIRF_DOOMED;
 
 	/*
 	 * Check to see if the vnode is in use.  If so, we have to call
 	 * VOP_CLOSE() and VOP_INACTIVE().
 	 */
 	active = vp->v_usecount > 0;
 	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
 	VI_UNLOCK(vp);
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (oweinact || active) {
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 	}
 	if (vp->v_type == VSOCK)
 		vfs_unp_reclaim(vp);
 
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	mp = NULL;
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
 	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 		while (vinvalbuf(vp, 0, 0, 0) != 0)
 			;
 	}
 
 	BO_LOCK(&vp->v_bufobj);
 	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
 	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
 	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
 	    vp->v_bufobj.bo_clean.bv_cnt == 0,
 	    ("vp %p bufobj not invalidated", vp));
 
 	/*
 	 * For VMIO bufobj, BO_DEAD is set later, or in
 	 * vm_object_terminate() after the object's page queue is
 	 * flushed.
 	 */
 	object = vp->v_bufobj.bo_object;
 	if (object == NULL)
 		vp->v_bufobj.bo_flag |= BO_DEAD;
 	BO_UNLOCK(&vp->v_bufobj);
 
 	/*
 	 * Handle the VM part.  Tmpfs handles v_object on its own (the
 	 * OBJT_VNODE check).  Nullfs or other bypassing filesystems
 	 * should not touch the object borrowed from the lower vnode
 	 * (the handle check).
 	 */
 	if (object != NULL && object->type == OBJT_VNODE &&
 	    object->handle == vp)
 		vnode_destroy_vobject(vp);
 
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vgone: cannot reclaim");
 	if (mp != NULL)
 		vn_finished_secondary_write(mp);
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
 	/*
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
 	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge(vp);
 	/*
 	 * Done with purge, reset to the standard lock and invalidate
 	 * the vnode.
 	 */
 	VI_LOCK(vp);
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_tag = "none";
 	vp->v_type = VBAD;
 }
 
 /*
  * Calculate the total number of references to a special device.
  */
 int
 vcount(struct vnode *vp)
 {
 	int count;
 
 	dev_lock();
 	count = vp->v_rdev->si_usecount;
 	dev_unlock();
 	return (count);
 }
 
 /*
  * Print out a description of a vnode.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
  "VMARKER"};
 
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
 	printf("    usecount %d, writecount %d, refcount %d",
 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
 		break;
 	case VCHR:
 		printf(" rdev %p\n", vp->v_rdev);
 		break;
 	case VSOCK:
 		printf(" socket %p\n", vp->v_unpcb);
 		break;
 	case VFIFO:
 		printf(" fifoinfo %p\n", vp->v_fifoinfo);
 		break;
 	default:
 		printf("\n");
 		break;
 	}
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
 		strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
 	flags = vp->v_irflag & ~(VIRF_DOOMED);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_vflag & VV_ROOT)
 		strlcat(buf, "|VV_ROOT", sizeof(buf));
 	if (vp->v_vflag & VV_ISTTY)
 		strlcat(buf, "|VV_ISTTY", sizeof(buf));
 	if (vp->v_vflag & VV_NOSYNC)
 		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
 	if (vp->v_vflag & VV_ETERNALDEV)
 		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
 	if (vp->v_vflag & VV_CACHEDLABEL)
 		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 	if (vp->v_vflag & VV_VMSIZEVNLOCK)
 		strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
 	if (vp->v_vflag & VV_COPYONWRITE)
 		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
 		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
 	if (vp->v_vflag & VV_PROCDEP)
 		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
 	if (vp->v_vflag & VV_NOKNOTE)
 		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
 	if (vp->v_vflag & VV_DELETED)
 		strlcat(buf, "|VV_DELETED", sizeof(buf));
 	if (vp->v_vflag & VV_MD)
 		strlcat(buf, "|VV_MD", sizeof(buf));
 	if (vp->v_vflag & VV_FORCEINSMQ)
 		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
 	if (vp->v_vflag & VV_READLINK)
 		strlcat(buf, "|VV_READLINK", sizeof(buf));
 	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
 	    VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
 	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_iflag & VI_TEXT_REF)
 		strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
 	if (vp->v_iflag & VI_FREE)
 		strlcat(buf, "|VI_FREE", sizeof(buf));
 	if (vp->v_iflag & VI_ACTIVE)
 		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
 	if (vp->v_iflag & VI_DOINGINACT)
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
 	    VI_DOINGINACT | VI_OWEINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_mflag & VMP_TMPMNTFREELIST)
 		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
 	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	printf("    flags (%s)\n", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL)
 		printf("    v_object %p ref %d pages %d "
 		    "cleanbuf %d dirtybuf %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count,
 		    vp->v_bufobj.bo_clean.bv_cnt,
 		    vp->v_bufobj.bo_dirty.bv_cnt);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 /*
  * List all of the locked vnodes in the system.
  * Called when debugging the kernel.
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct mount *mp;
 	struct vnode *vp;
 
 	/*
 	 * Note: because this is DDB, we can't obey the locking semantics
 	 * for these structures, which means we could catch an inconsistent
 	 * state and dereference a nasty pointer.  Not much to be done
 	 * about that.
 	 */
 	db_printf("Locked vnodes\n");
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
 				vn_printf(vp, "vnode ");
 		}
 	}
 }
 
 /*
  * Show details about the given vnode.
  */
 DB_SHOW_COMMAND(vnode, db_show_vnode)
 {
 	struct vnode *vp;
 
 	if (!have_addr)
 		return;
 	vp = (struct vnode *)addr;
 	vn_printf(vp, "vnode ");
 }
 
 /*
  * Show details about the given mount point.
  */
 DB_SHOW_COMMAND(mount, db_show_mount)
 {
 	struct mount *mp;
 	struct vfsopt *opt;
 	struct statfs *sp;
 	struct vnode *vp;
 	char buf[512];
 	uint64_t mflags;
 	u_int flags;
 
 	if (!have_addr) {
 		/* No address given, print short info about all mount points. */
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			db_printf("%p %s on %s (%s)\n", mp,
 			    mp->mnt_stat.f_mntfromname,
 			    mp->mnt_stat.f_mntonname,
 			    mp->mnt_stat.f_fstypename);
 			if (db_pager_quit)
 				break;
 		}
 		db_printf("\nMore info: show mount <addr>\n");
 		return;
 	}
 
 	mp = (struct mount *)addr;
 	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
 	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
 
 	buf[0] = '\0';
 	mflags = mp->mnt_flag;
 #define	MNT_FLAG(flag)	do {						\
 	if (mflags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 4, sizeof(buf));			\
 		mflags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_FLAG(MNT_RDONLY);
 	MNT_FLAG(MNT_SYNCHRONOUS);
 	MNT_FLAG(MNT_NOEXEC);
 	MNT_FLAG(MNT_NOSUID);
 	MNT_FLAG(MNT_NFS4ACLS);
 	MNT_FLAG(MNT_UNION);
 	MNT_FLAG(MNT_ASYNC);
 	MNT_FLAG(MNT_SUIDDIR);
 	MNT_FLAG(MNT_SOFTDEP);
 	MNT_FLAG(MNT_NOSYMFOLLOW);
 	MNT_FLAG(MNT_GJOURNAL);
 	MNT_FLAG(MNT_MULTILABEL);
 	MNT_FLAG(MNT_ACLS);
 	MNT_FLAG(MNT_NOATIME);
 	MNT_FLAG(MNT_NOCLUSTERR);
 	MNT_FLAG(MNT_NOCLUSTERW);
 	MNT_FLAG(MNT_SUJ);
 	MNT_FLAG(MNT_EXRDONLY);
 	MNT_FLAG(MNT_EXPORTED);
 	MNT_FLAG(MNT_DEFEXPORTED);
 	MNT_FLAG(MNT_EXPORTANON);
 	MNT_FLAG(MNT_EXKERB);
 	MNT_FLAG(MNT_EXPUBLIC);
 	MNT_FLAG(MNT_LOCAL);
 	MNT_FLAG(MNT_QUOTA);
 	MNT_FLAG(MNT_ROOTFS);
 	MNT_FLAG(MNT_USER);
 	MNT_FLAG(MNT_IGNORE);
 	MNT_FLAG(MNT_UPDATE);
 	MNT_FLAG(MNT_DELEXPORT);
 	MNT_FLAG(MNT_RELOAD);
 	MNT_FLAG(MNT_FORCE);
 	MNT_FLAG(MNT_SNAPSHOT);
 	MNT_FLAG(MNT_BYFSID);
 #undef MNT_FLAG
 	if (mflags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%016jx", mflags);
 	}
 	db_printf("    mnt_flag = %s\n", buf);
 
 	buf[0] = '\0';
 	flags = mp->mnt_kern_flag;
 #define	MNT_KERN_FLAG(flag)	do {					\
 	if (flags & (flag)) {						\
 		if (buf[0] != '\0')					\
 			strlcat(buf, ", ", sizeof(buf));		\
 		strlcat(buf, (#flag) + 5, sizeof(buf));			\
 		flags &= ~(flag);					\
 	}								\
 } while (0)
 	MNT_KERN_FLAG(MNTK_UNMOUNTF);
 	MNT_KERN_FLAG(MNTK_ASYNC);
 	MNT_KERN_FLAG(MNTK_SOFTDEP);
 	MNT_KERN_FLAG(MNTK_DRAINING);
 	MNT_KERN_FLAG(MNTK_REFEXPIRE);
 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 	MNT_KERN_FLAG(MNTK_NO_IOPF);
 	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 	MNT_KERN_FLAG(MNTK_MARKER);
 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
 	MNT_KERN_FLAG(MNTK_SUSPEND);
 	MNT_KERN_FLAG(MNTK_SUSPEND2);
 	MNT_KERN_FLAG(MNTK_SUSPENDED);
 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 	MNT_KERN_FLAG(MNTK_NOKNOTE);
 #undef MNT_KERN_FLAG
 	if (flags != 0) {
 		if (buf[0] != '\0')
 			strlcat(buf, ", ", sizeof(buf));
 		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
 		    "0x%08x", flags);
 	}
 	db_printf("    mnt_kern_flag = %s\n", buf);
 
 	db_printf("    mnt_opt = ");
 	opt = TAILQ_FIRST(mp->mnt_opt);
 	if (opt != NULL) {
 		db_printf("%s", opt->name);
 		opt = TAILQ_NEXT(opt, link);
 		while (opt != NULL) {
 			db_printf(", %s", opt->name);
 			opt = TAILQ_NEXT(opt, link);
 		}
 	}
 	db_printf("\n");
 
 	sp = &mp->mnt_stat;
 	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
 	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
 	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
 	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
 	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
 	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
 	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
 	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
 	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
 	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
 	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
 	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
 
 	db_printf("    mnt_cred = { uid=%u ruid=%u",
 	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
 	if (jailed(mp->mnt_cred))
 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 	db_printf(" }\n");
 	db_printf("    mnt_ref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_activevnodelistsize = %d\n",
 	    mp->mnt_activevnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
 	db_printf("    mnt_lockref = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 	db_printf("    mnt_secondary_accwrites = %d\n",
 	    mp->mnt_secondary_accwrites);
 	db_printf("    mnt_gjprovider = %s\n",
 	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
 	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	db_printf("\n\nList of active vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
 		if (vp->v_type != VMARKER) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 	db_printf("\n\nList of inactive vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
 			vn_printf(vp, "vnode ");
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif	/* DDB */
 
 /*
  * Fill in a struct xvfsconf based on a struct vfsconf.
  */
 static int
 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp.vfc_vfsops = NULL;
 	xvfsp.vfc_next = NULL;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 
 #ifdef COMPAT_FREEBSD32
 struct xvfsconf32 {
 	uint32_t	vfc_vfsops;
 	char		vfc_name[MFSNAMELEN];
 	int32_t		vfc_typenum;
 	int32_t		vfc_refcount;
 	int32_t		vfc_flags;
 	uint32_t	vfc_next;
 };
 
 static int
 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
 {
 	struct xvfsconf32 xvfsp;
 
 	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 #endif
 
 /*
  * Top level filesystem related information gathering.
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	int error;
 
 	error = 0;
 	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 #ifdef COMPAT_FREEBSD32
 		if (req->flags & SCTL_MASK32)
 			error = vfsconf2x32(req, vfsp);
 		else
 #endif
 			error = vfsconf2x(req, vfsp);
 		if (error)
 			break;
 	}
 	vfsconf_sunlock();
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 
 	log(LOG_WARNING, "userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		vfsconf_slock();
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		}
 		vfsconf_sunlock();
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 #ifdef COMPAT_FREEBSD32
 		if (req->flags & SCTL_MASK32)
 			return (vfsconf2x32(req, vfsp));
 		else
 #endif
 			return (vfsconf2x(req, vfsp));
 	}
 	return (EOPNOTSUPP);
 }
 
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, vfs_sysctl,
     "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error != 0) {
 			vfsconf_sunlock();
 			return (error);
 		}
 	}
 	vfsconf_sunlock();
 	return (0);
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, n;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (n == len)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 			xvn[n].xv_id = 0;	/* XXX compat */
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp);
 		if (n == len)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
     "");
 #endif
 
 static void
 unmount_or_warn(struct mount *mp)
 {
 	int error;
 
 	error = dounmount(mp, MNT_FORCE, curthread);
 	if (error != 0) {
 		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
 		if (error == EBUSY)
 			printf("BUSY)\n");
 		else
 			printf("%d)\n", error);
 	}
 }
 
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
  */
 void
 vfs_unmountall(void)
 {
 	struct mount *mp, *tmp;
 
 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
 
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
 		vfs_ref(mp);
 
 		/*
 		 * Forcibly unmounting "/dev" before "/" would prevent clean
 		 * unmount of the latter.
 		 */
 		if (mp == rootdevmp)
 			continue;
 
 		unmount_or_warn(mp);
 	}
 
 	if (rootdevmp != NULL)
 		unmount_or_warn(rootdevmp);
 }
 
 /*
  * perform msync on all vnodes under a mount point
  * the mount point must be locked.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
 
 	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
 		return;
 
 	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
 		obj = vp->v_object;
 		if (obj != NULL && vm_object_mightbedirty(obj) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					continue;
 				}
 
 				obj = vp->v_object;
 				if (obj != NULL) {
 					VM_OBJECT_WLOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 					VM_OBJECT_WUNLOCK(obj);
 				}
 				vput(vp);
 			}
 		} else
 			VI_UNLOCK(vp);
 	}
 }
 
 static void
 destroy_vpollinfo_free(struct vpollinfo *vi)
 {
 
 	knlist_destroy(&vi->vpi_selinfo.si_note);
 	mtx_destroy(&vi->vpi_lock);
 	uma_zfree(vnodepoll_zone, vi);
 }
 
 static void
 destroy_vpollinfo(struct vpollinfo *vi)
 {
 
 	knlist_clear(&vi->vpi_selinfo.si_note, 1);
 	seldrain(&vi->vpi_selinfo);
 	destroy_vpollinfo_free(vi);
 }
 
 /*
  * Initialize per-vnode helper structure to hold poll-related state.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *vi;
 
 	if (vp->v_pollinfo != NULL)
 		return;
 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
 	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
 	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
 	VI_LOCK(vp);
 	if (vp->v_pollinfo != NULL) {
 		VI_UNLOCK(vp);
 		destroy_vpollinfo_free(vi);
 		return;
 	}
 	vp->v_pollinfo = vi;
 	VI_UNLOCK(vp);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return (events);
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return (0);
 }
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock1 =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
+VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  */
 void
 vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: getnewvnode() failed");
 	vp->v_type = VNON;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vp->v_vflag |= VV_FORCEINSMQ;
 	error = insmntque(vp, mp);
 	if (error != 0)
 		panic("vfs_allocate_syncvnode: insmntque() failed");
 	vp->v_vflag &= ~VV_FORCEINSMQ;
 	VOP_UNLOCK(vp, 0);
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	if (mp->mnt_syncer == NULL) {
 		mp->mnt_syncer = vp;
 		vp = NULL;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 	if (vp != NULL) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vgone(vp);
 		vput(vp);
 	}
 }
 
 void
 vfs_deallocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 
 	mtx_lock(&sync_mtx);
 	vp = mp->mnt_syncer;
 	if (vp != NULL)
 		mp->mnt_syncer = NULL;
 	mtx_unlock(&sync_mtx);
 	if (vp != NULL)
 		vrele(vp);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  */
 static int
 sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	int error, save;
 	struct bufobj *bo;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 		return (0);
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp);
 		return (0);
 	}
 	save = curthread_pflags_set(TDP_SYNCIO);
 	/*
 	 * The filesystem at hand may be idle with free vnodes stored in the
 	 * batch.  Return them instead of letting them stay there indefinitely.
 	 */
 	vnlru_return_batch(mp);
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY);
 	curthread_pflags_restore(save);
 	vn_finished_write(mp);
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * The syncer vnode is no referenced.
  */
 static int
 sync_inactive(struct vop_inactive_args *ap)
 {
 
 	vgone(ap->a_vp);
 	return (0);
 }
 
 /*
  * The syncer vnode is no longer needed and is being decommissioned.
  *
  * Modifications to the worklist must be protected by sync_mtx.
  */
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 	mtx_lock(&sync_mtx);
 	if (vp->v_mount->mnt_syncer == vp)
 		vp->v_mount->mnt_syncer = NULL;
 	if (bo->bo_flag & BO_ONWORKLST) {
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		sync_vnode_count--;
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	mtx_unlock(&sync_mtx);
 	BO_UNLOCK(bo);
 
 	return (0);
 }
 
 int
 vn_need_pageq_flush(struct vnode *vp)
 {
 	struct vm_object *obj;
 	int need;
 
 	MPASS(mtx_owned(VI_MTX(vp)));
 	need = 0;
 	if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
 	    vm_object_mightbedirty(obj))
 		need = 1;
 	return (need);
 }
 
 /*
  * Check if vnode represents a disk device
  */
 int
 vn_isdisk(struct vnode *vp, int *errp)
 {
 	int error;
 
 	if (vp->v_type != VCHR) {
 		error = ENOTBLK;
 		goto out;
 	}
 	error = 0;
 	dev_lock();
 	if (vp->v_rdev == NULL)
 		error = ENXIO;
 	else if (vp->v_rdev->si_devsw == NULL)
 		error = ENXIO;
 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
 		error = ENOTBLK;
 	dev_unlock();
 out:
 	if (errp != NULL)
 		*errp = error;
 	return (error == 0);
 }
 
 /*
  * Common filesystem object access control check routine.  Accepts a
  * vnode's type, "mode", uid and gid, requested access mode, credentials,
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
     accmode_t accmode, struct ucred *cred, int *privused)
 {
 	accmode_t dac_granted;
 	accmode_t priv_granted;
 
 	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
 	    ("invalid bit in accmode"));
 	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
 	    ("VAPPEND without VWRITE"));
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
 	 * as requested.  If it exists, go with that.
 	 */
 
 	if (privused != NULL)
 		*privused = 0;
 
 	dac_granted = 0;
 
 	/* Check the owner. */
 	if (cred->cr_uid == file_uid) {
 		dac_granted |= VADMIN;
 		if (file_mode & S_IXUSR)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRUSR)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWUSR)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		goto privcheck;
 	}
 
 	/* Otherwise, check the groups (first match) */
 	if (groupmember(file_gid, cred)) {
 		if (file_mode & S_IXGRP)
 			dac_granted |= VEXEC;
 		if (file_mode & S_IRGRP)
 			dac_granted |= VREAD;
 		if (file_mode & S_IWGRP)
 			dac_granted |= (VWRITE | VAPPEND);
 
 		if ((accmode & dac_granted) == accmode)
 			return (0);
 
 		goto privcheck;
 	}
 
 	/* Otherwise, check everyone else. */
 	if (file_mode & S_IXOTH)
 		dac_granted |= VEXEC;
 	if (file_mode & S_IROTH)
 		dac_granted |= VREAD;
 	if (file_mode & S_IWOTH)
 		dac_granted |= (VWRITE | VAPPEND);
 	if ((accmode & dac_granted) == accmode)
 		return (0);
 
 privcheck:
 	/*
 	 * Build a privilege mask to determine if the set of privileges
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.  For each privilege, if the privilege is required,
 	 * bitwise or the request type onto the priv_granted mask.
 	 */
 	priv_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
 		 * requests, instead of PRIV_VFS_EXEC.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !priv_check_cred(cred, PRIV_VFS_LOOKUP))
 			priv_granted |= VEXEC;
 	} else {
 		/*
 		 * Ensure that at least one execute bit is on. Otherwise,
 		 * a privileged user will always succeed, and we don't want
 		 * this to happen unless the file really is executable.
 		 */
 		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
 		    !priv_check_cred(cred, PRIV_VFS_EXEC))
 			priv_granted |= VEXEC;
 	}
 
 	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_READ))
 		priv_granted |= VREAD;
 
 	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_WRITE))
 		priv_granted |= (VWRITE | VAPPEND);
 
 	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !priv_check_cred(cred, PRIV_VFS_ADMIN))
 		priv_granted |= VADMIN;
 
 	if ((accmode & (priv_granted | dac_granted)) == accmode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 
 	return ((accmode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Credential check based on process requesting service, and per-attribute
  * permissions.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
     struct thread *td, accmode_t accmode)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly manipulate
 	 * system attributes.
 	 */
 	switch (attrnamespace) {
 	case EXTATTR_NAMESPACE_SYSTEM:
 		/* Potentially should be: return (EPERM); */
 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 	case EXTATTR_NAMESPACE_USER:
 		return (VOP_ACCESS(vp, accmode, cred, td));
 	default:
 		return (EPERM);
 	}
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  */
 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
 	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
 
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
     "Drop into debugger on lock violation");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
     0, "Check for interlock across VOPs");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
     0, "Print lock violations");
 
 int vfs_badlock_vnode = 1;	/* Print vnode details on lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
     0, "Print vnode details on lock violations");
 
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
     &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
 #endif
 
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_vnode)
 		vn_printf(vp, "vnode ");
 	if (vfs_badlock_print)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb)
 		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 }
 
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
 		vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 	int locked;
 
 	if (!IGNORE_LOCK(vp)) {
 		locked = VOP_ISLOCKED(vp);
 		if (locked == 0 || locked == LK_EXCLOTHER)
 			vfs_badlock("is not locked but should be", str, vp);
 	}
 }
 
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 #endif /* DEBUG_VFS_LOCKS */
 
 void
 vop_rename_fail(struct vop_rename_args *ap)
 {
 
 	if (ap->a_tvp != NULL)
 		vput(ap->a_tvp);
 	if (ap->a_tdvp == ap->a_tvp)
 		vrele(ap->a_tdvp);
 	else
 		vput(ap->a_tdvp);
 	vrele(ap->a_fdvp);
 	vrele(ap->a_fvp);
 }
 
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
 	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 #endif
 	if (a->a_tdvp != a->a_fdvp)
 		vhold(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vhold(a->a_fvp);
 	vhold(a->a_tdvp);
 	if (a->a_tvp)
 		vhold(a->a_tvp);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 void
 vop_strategy_pre(void *ap)
 {
 	struct vop_strategy_args *a;
 	struct buf *bp;
 
 	a = ap;
 	bp = a->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
 		if (vfs_badlock_print)
 			printf(
 			    "VOP_STRATEGY: bp is not locked but should be\n");
 		if (vfs_badlock_ddb)
 			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
 	}
 }
 
 void
 vop_lock_pre(void *ap)
 {
 	struct vop_lock1_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) == 0)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	else
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
 }
 
 void
 vop_lock_post(void *ap, int rc)
 {
 	struct vop_lock1_args *a = ap;
 
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
 		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
 }
 
 void
 vop_unlock_pre(void *ap)
 {
 	struct vop_unlock_args *a = ap;
 
 	if (a->a_flags & LK_INTERLOCK)
 		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
 	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
 }
 
 void
 vop_unlock_post(void *ap, int rc)
 {
 	struct vop_unlock_args *a = ap;
 
 	if (a->a_flags & LK_INTERLOCK)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
 }
 
 void
 vop_need_inactive_pre(void *ap)
 {
 	struct vop_need_inactive_args *a = ap;
 
 	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 }
 
 void
 vop_need_inactive_post(void *ap, int rc)
 {
 	struct vop_need_inactive_args *a = ap;
 
 	ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
 }
 #endif
 
 void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 void
 vop_deleteextattr_post(void *ap, int rc)
 {
 	struct vop_deleteextattr_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 }
 
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
 	}
 }
 
 void
 vop_mkdir_post(void *ap, int rc)
 {
 	struct vop_mkdir_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 }
 
 void
 vop_mknod_post(void *ap, int rc)
 {
 	struct vop_mknod_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 void
 vop_reclaim_post(void *ap, int rc)
 {
 	struct vop_reclaim_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
 }
 
 void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 	}
 }
 
 void
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
 	long hint;
 
 	if (!rc) {
 		hint = NOTE_WRITE;
 		if (a->a_fdvp == a->a_tdvp) {
 			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		} else {
 			hint |= NOTE_EXTEND;
 			if (a->a_fvp->v_type == VDIR)
 				hint |= NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
 
 			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
 			    a->a_tvp->v_type == VDIR)
 				hint &= ~NOTE_LINK;
 			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
 		}
 
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 	}
 	if (a->a_tdvp != a->a_fdvp)
 		vdrop(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vdrop(a->a_fvp);
 	vdrop(a->a_tdvp);
 	if (a->a_tvp)
 		vdrop(a->a_tvp);
 }
 
 void
 vop_rmdir_post(void *ap, int rc)
 {
 	struct vop_rmdir_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
 	}
 }
 
 void
 vop_setattr_post(void *ap, int rc)
 {
 	struct vop_setattr_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 }
 
 void
 vop_setextattr_post(void *ap, int rc)
 {
 	struct vop_setextattr_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
 }
 
 void
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 void
 vop_open_post(void *ap, int rc)
 {
 	struct vop_open_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
 }
 
 void
 vop_close_post(void *ap, int rc)
 {
 	struct vop_close_args *a = ap;
 
 	if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
 	    !VN_IS_DOOMED(a->a_vp))) {
 		VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
 		    NOTE_CLOSE_WRITE : NOTE_CLOSE);
 	}
 }
 
 void
 vop_read_post(void *ap, int rc)
 {
 	struct vop_read_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 }
 
 void
 vop_readdir_post(void *ap, int rc)
 {
 	struct vop_readdir_args *a = ap;
 
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
 }
 
 static struct knlist fs_knlist;
 
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init_mtx(&fs_knlist, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 void
 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 struct filterops fs_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_fsattach,
 	.f_detach = filt_fsdetach,
 	.f_event = filt_fsevent
 };
 
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	return (kn->kn_fflags != 0);
 }
 
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	int error;
 	struct mount *mp;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 		vfs_rel(mp);
 		return (EINVAL);
 	}
 	VCTLTOREQ(&vc, req);
 	error = VFS_SYSCTL(mp, vc.vc_op, req);
 	vfs_rel(mp);
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
     NULL, 0, sysctl_vfs_ctl, "",
     "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
 
 static int	filt_vfsread(struct knote *kn, long hint);
 static int	filt_vfswrite(struct knote *kn, long hint);
 static int	filt_vfsvnode(struct knote *kn, long hint);
 static void	filt_vfsdetach(struct knote *kn);
 static struct filterops vfsread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsread
 };
 static struct filterops vfswrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfswrite
 };
 static struct filterops vfsvnode_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_vfsdetach,
 	.f_event = filt_vfsvnode
 };
 
 static void
 vfs_knllock(void *arg)
 {
 	struct vnode *vp = arg;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 }
 
 static void
 vfs_knlunlock(void *arg)
 {
 	struct vnode *vp = arg;
 
 	VOP_UNLOCK(vp, 0);
 }
 
 static void
 vfs_knl_assert_locked(void *arg)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vnode *vp = arg;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
 #endif
 }
 
 static void
 vfs_knl_assert_unlocked(void *arg)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vnode *vp = arg;
 
 	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
 #endif
 }
 
 int
 vfs_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
 	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &vfsread_filtops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &vfswrite_filtops;
 		break;
 	case EVFILT_VNODE:
 		kn->kn_fop = &vfsvnode_filtops;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	kn->kn_hook = (caddr_t)vp;
 
 	v_addpollinfo(vp);
 	if (vp->v_pollinfo == NULL)
 		return (ENOMEM);
 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
 	vhold(vp);
 	knlist_add(knl, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach knote from vnode
  */
 static void
 filt_vfsdetach(struct knote *kn)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 
 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 	vdrop(vp);
 }
 
 /*ARGSUSED*/
 static int
 filt_vfsread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	struct vattr va;
 	int res;
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		VI_LOCK(vp);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		VI_UNLOCK(vp);
 		return (1);
 	}
 
 	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
 		return (0);
 
 	VI_LOCK(vp);
 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
 	VI_UNLOCK(vp);
 	return (res);
 }
 
 /*ARGSUSED*/
 static int
 filt_vfswrite(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 
 	VI_LOCK(vp);
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
 	kn->kn_data = 0;
 	VI_UNLOCK(vp);
 	return (1);
 }
 
 static int
 filt_vfsvnode(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	int res;
 
 	VI_LOCK(vp);
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
 	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		kn->kn_flags |= EV_EOF;
 		VI_UNLOCK(vp);
 		return (1);
 	}
 	res = (kn->kn_fflags != 0);
 	VI_UNLOCK(vp);
 	return (res);
 }
 
 /*
  * Returns whether the directory is empty or not.
  * If it is empty, the return value is 0; otherwise
  * the return value is an error value (which may
  * be ENOTEMPTY).
  */
 int
 vfs_emptydir(struct vnode *vp)
 {
 	struct uio uio;
 	struct iovec iov;
 	struct dirent *dirent, *dp, *endp;
 	int error, eof;
 
 	error = 0;
 	eof = 0;
 
 	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
 
 	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
 	iov.iov_base = dirent;
 	iov.iov_len = sizeof(struct dirent);
 
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = sizeof(struct dirent);
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 
 	while (eof == 0 && error == 0) {
 		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
 		    NULL, NULL);
 		if (error != 0)
 			break;
 		endp = (void *)((uint8_t *)dirent +
 		    sizeof(struct dirent) - uio.uio_resid);
 		for (dp = dirent; dp < endp;
 		     dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
 			if (dp->d_type == DT_WHT)
 				continue;
 			if (dp->d_namlen == 0)
 				continue;
 			if (dp->d_type != DT_DIR &&
 			    dp->d_type != DT_UNKNOWN) {
 				error = ENOTEMPTY;
 				break;
 			}
 			if (dp->d_namlen > 2) {
 				error = ENOTEMPTY;
 				break;
 			}
 			if (dp->d_namlen == 1 &&
 			    dp->d_name[0] != '.') {
 				error = ENOTEMPTY;
 				break;
 			}
 			if (dp->d_namlen == 2 &&
 			    dp->d_name[1] != '.') {
 				error = ENOTEMPTY;
 				break;
 			}
 			uio.uio_resid = sizeof(struct dirent);
 		}
 	}
 	free(dirent, M_TEMP);
 	return (error);
 }
 
 int
 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 {
 	int error;
 
 	if (dp->d_reclen > ap->a_uio->uio_resid)
 		return (ENAMETOOLONG);
 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
 	if (error) {
 		if (ap->a_ncookies != NULL) {
 			if (ap->a_cookies != NULL)
 				free(ap->a_cookies, M_TEMP);
 			ap->a_cookies = NULL;
 			*ap->a_ncookies = 0;
 		}
 		return (error);
 	}
 	if (ap->a_ncookies == NULL)
 		return (0);
 
 	KASSERT(ap->a_cookies,
 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
 	*ap->a_ncookies += 1;
 	return (0);
 }
 
 /*
  * Mark for update the access time of the file if the filesystem
  * supports VOP_MARKATIME.  This functionality is used by execve and
  * mmap, so we want to avoid the I/O implied by directly setting
  * va_atime for the sake of efficiency.
  */
 void
 vfs_mark_atime(struct vnode *vp, struct ucred *cred)
 {
 	struct mount *mp;
 
 	mp = vp->v_mount;
 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		(void)VOP_MARKATIME(vp);
 }
 
 /*
  * The purpose of this routine is to remove granularity from accmode_t,
  * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
  * VADMIN and VAPPEND.
  *
  * If it returns 0, the caller is supposed to continue with the usual
  * access checks using 'accmode' as modified by this routine.  If it
  * returns nonzero value, the caller is supposed to return that value
  * as errno.
  *
  * Note that after this routine runs, accmode may be zero.
  */
 int
 vfs_unixify_accmode(accmode_t *accmode)
 {
 	/*
 	 * There is no way to specify explicit "deny" rule using
 	 * file mode or POSIX.1e ACLs.
 	 */
 	if (*accmode & VEXPLICIT_DENY) {
 		*accmode = 0;
 		return (0);
 	}
 
 	/*
 	 * None of these can be translated into usual access bits.
 	 * Also, the common case for NFSv4 ACLs is to not contain
 	 * either of these bits. Caller should check for VWRITE
 	 * on the containing directory instead.
 	 */
 	if (*accmode & (VDELETE_CHILD | VDELETE))
 		return (EPERM);
 
 	if (*accmode & VADMIN_PERMS) {
 		*accmode &= ~VADMIN_PERMS;
 		*accmode |= VADMIN;
 	}
 
 	/*
 	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
 	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
 	 */
 	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
 
 	return (0);
 }
 
 /*
  * Clear out a doomed vnode (if any) and replace it with a new one as long
  * as the fs is not being unmounted. Return the root vnode to the caller.
  */
 static int __noinline
 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 restart:
 	if (mp->mnt_rootvnode != NULL) {
 		MNT_ILOCK(mp);
 		vp = mp->mnt_rootvnode;
 		if (vp != NULL) {
 			if (!VN_IS_DOOMED(vp)) {
 				vrefact(vp);
 				MNT_IUNLOCK(mp);
 				error = vn_lock(vp, flags);
 				if (error == 0) {
 					*vpp = vp;
 					return (0);
 				}
 				vrele(vp);
 				goto restart;
 			}
 			/*
 			 * Clear the old one.
 			 */
 			mp->mnt_rootvnode = NULL;
 		}
 		MNT_IUNLOCK(mp);
 		if (vp != NULL) {
 			/*
 			 * Paired with a fence in vfs_op_thread_exit().
 			 */
 			atomic_thread_fence_acq();
 			vfs_op_barrier_wait(mp);
 			vrele(vp);
 		}
 	}
 	error = VFS_CACHEDROOT(mp, flags, vpp);
 	if (error != 0)
 		return (error);
 	if (mp->mnt_vfs_ops == 0) {
 		MNT_ILOCK(mp);
 		if (mp->mnt_vfs_ops != 0) {
 			MNT_IUNLOCK(mp);
 			return (0);
 		}
 		if (mp->mnt_rootvnode == NULL) {
 			vrefact(*vpp);
 			mp->mnt_rootvnode = *vpp;
 		} else {
 			if (mp->mnt_rootvnode != *vpp) {
 				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
 					panic("%s: mismatch between vnode returned "
 					    " by VFS_CACHEDROOT and the one cached "
 					    " (%p != %p)",
 					    __func__, *vpp, mp->mnt_rootvnode);
 				}
 			}
 		}
 		MNT_IUNLOCK(mp);
 	}
 	return (0);
 }
 
 int
 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct vnode *vp;
 	int error;
 
 	if (!vfs_op_thread_enter(mp))
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	vp = (struct vnode *)atomic_load_ptr(&mp->mnt_rootvnode);
 	if (vp == NULL || VN_IS_DOOMED(vp)) {
 		vfs_op_thread_exit(mp);
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	}
 	vrefact(vp);
 	vfs_op_thread_exit(mp);
 	error = vn_lock(vp, flags);
 	if (error != 0) {
 		vrele(vp);
 		return (vfs_cache_root_fallback(mp, flags, vpp));
 	}
 	*vpp = vp;
 	return (0);
 }
 
 struct vnode *
 vfs_cache_root_clear(struct mount *mp)
 {
 	struct vnode *vp;
 
 	/*
 	 * ops > 0 guarantees there is nobody who can see this vnode
 	 */
 	MPASS(mp->mnt_vfs_ops > 0);
 	vp = mp->mnt_rootvnode;
 	mp->mnt_rootvnode = NULL;
 	return (vp);
 }
 
 void
 vfs_cache_root_set(struct mount *mp, struct vnode *vp)
 {
 
 	MPASS(mp->mnt_vfs_ops > 0);
 	vrefact(vp);
 	mp->mnt_rootvnode = vp;
 }
 
 /*
  * These are helper functions for filesystems to traverse all
  * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
  *
  * This interface replaces MNT_VNODE_FOREACH.
  */
 
 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
 
 struct vnode *
 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	if (should_yield())
 		kern_yield(PRI_USER);
 	MNT_ILOCK(mp);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
 	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		__mnt_vnode_markerfree_all(mvp, mp);
 		/* MNT_IUNLOCK(mp); -- done in above function */
 		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
 		return (NULL);
 	}
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 struct vnode *
 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	(*mvp)->v_mount = mp;
 	(*mvp)->v_type = VMARKER;
 
 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
 		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
 			continue;
 		VI_LOCK(vp);
 		if (VN_IS_DOOMED(vp)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		break;
 	}
 	if (vp == NULL) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		free(*mvp, M_VNODE_MARKER);
 		*mvp = NULL;
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
 void
 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL) {
 		MNT_IUNLOCK(mp);
 		return;
 	}
 
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	free(*mvp, M_VNODE_MARKER);
 	*mvp = NULL;
 }
 
 /*
  * These are helper functions for filesystems to traverse their
  * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
  */
 static void
 mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
 {
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	free(*mvp, M_VNODE_MARKER);
 	*mvp = NULL;
 }
 
 /*
  * Relock the mp mount vnode list lock with the vp vnode interlock in the
  * conventional lock order during mnt_vnode_next_active iteration.
  *
  * On entry, the mount vnode list lock is held and the vnode interlock is not.
  * The list lock is dropped and reacquired.  On success, both locks are held.
  * On failure, the mount vnode list lock is held but the vnode interlock is
  * not, and the procedure may have yielded.
  */
 static bool
 mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
     struct vnode *vp)
 {
 	const struct vnode *tmp;
 	bool held, ret;
 
 	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
 	    TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
 	    ("%s: bad marker", __func__));
 	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 	    ("%s: inappropriate vnode", __func__));
 	ASSERT_VI_UNLOCKED(vp, __func__);
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 
 	ret = false;
 
 	TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
 	TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
 
 	/*
 	 * Use a hold to prevent vp from disappearing while the mount vnode
 	 * list lock is dropped and reacquired.  Normally a hold would be
 	 * acquired with vhold(), but that might try to acquire the vnode
 	 * interlock, which would be a LOR with the mount vnode list lock.
 	 */
 	held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
 	mtx_unlock(&mp->mnt_listmtx);
 	if (!held)
 		goto abort;
 	VI_LOCK(vp);
 	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
 		vdropl(vp);
 		goto abort;
 	}
 	mtx_lock(&mp->mnt_listmtx);
 
 	/*
 	 * Determine whether the vnode is still the next one after the marker,
 	 * excepting any other markers.  If the vnode has not been doomed by
 	 * vgone() then the hold should have ensured that it remained on the
 	 * active list.  If it has been doomed but is still on the active list,
 	 * don't abort, but rather skip over it (avoid spinning on doomed
 	 * vnodes).
 	 */
 	tmp = mvp;
 	do {
 		tmp = TAILQ_NEXT(tmp, v_actfreelist);
 	} while (tmp != NULL && tmp->v_type == VMARKER);
 	if (tmp != vp) {
 		mtx_unlock(&mp->mnt_listmtx);
 		VI_UNLOCK(vp);
 		goto abort;
 	}
 
 	ret = true;
 	goto out;
 abort:
 	maybe_yield();
 	mtx_lock(&mp->mnt_listmtx);
 out:
 	if (ret)
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	return (ret);
 }
 
 static struct vnode *
 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp, *nvp;
 
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 restart:
 	vp = TAILQ_NEXT(*mvp, v_actfreelist);
 	while (vp != NULL) {
 		if (vp->v_type == VMARKER) {
 			vp = TAILQ_NEXT(vp, v_actfreelist);
 			continue;
 		}
 		/*
 		 * Try-lock because this is the wrong lock order.  If that does
 		 * not succeed, drop the mount vnode list lock and try to
 		 * reacquire it and the vnode interlock in the right order.
 		 */
 		if (!VI_TRYLOCK(vp) &&
 		    !mnt_vnode_next_active_relock(*mvp, mp, vp))
 			goto restart;
 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
 		    ("alien vnode on the active list %p %p", vp, mp));
 		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
 			break;
 		nvp = TAILQ_NEXT(vp, v_actfreelist);
 		VI_UNLOCK(vp);
 		vp = nvp;
 	}
 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_active(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
 	mtx_unlock(&mp->mnt_listmtx);
 	ASSERT_VI_LOCKED(vp, "active iter");
 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
 	return (vp);
 }
 
 struct vnode *
 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
 {
 
 	if (should_yield())
 		kern_yield(PRI_USER);
 	mtx_lock(&mp->mnt_listmtx);
 	return (mnt_vnode_next_active(mvp, mp));
 }
 
 struct vnode *
 __mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
 {
 	struct vnode *vp;
 
 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
 	(*mvp)->v_type = VMARKER;
 	(*mvp)->v_mount = mp;
 
 	mtx_lock(&mp->mnt_listmtx);
 	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
 		mnt_vnode_markerfree_active(mvp, mp);
 		return (NULL);
 	}
 	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
 	return (mnt_vnode_next_active(mvp, mp));
 }
 
 void
 __mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
 {
 
 	if (*mvp == NULL)
 		return;
 
 	mtx_lock(&mp->mnt_listmtx);
 	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
 	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_active(mvp, mp);
 }
Index: head/sys/sys/vnode.h
===================================================================
--- head/sys/sys/vnode.h	(revision 355789)
+++ head/sys/sys/vnode.h	(revision 355790)
@@ -1,960 +1,968 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 enum vgetstate	{ VGET_HOLDCNT, VGET_USECOUNT };
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	i - interlock
  *	l - mp mnt_listmtx or freelist mutex
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type:8;			/* u vnode type */
 	short	v_irflag;			/* i frequently read flags */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 */
 	union {
 		struct mount	*v_mountedhere;	/* v ptr to mountpoint (VDIR) */
 		struct unpcb	*v_unpcb;	/* v unix domain net (VSOCK) */
 		struct cdev	*v_rdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*v_fifoinfo;	/* v fifo (VFIFO) */
 	};
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_int	v_iflag;			/* i vnode flags (see below) */
 	u_int	v_vflag;			/* v vnode flags */
 	u_int	v_mflag;			/* l mnt-specific vnode flags */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	u_int	v_hash;
 	const char *v_tag;			/* u type of underlying data */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	bo2vnode(bo)	__containerof((bo), struct vnode, v_bufobj)
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* unpcb, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /* We don't need to lock the knlist */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 #define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	u_short		va_padding0;
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	nlink_t		va_nlink;	/* number of references to file */
 	dev_t		va_fsid;	/* filesystem id */
 	ino_t		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather then bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow to change the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
 #define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_MNTREF	0x0010	/* vn_start_write: mp is already ref-ed */
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	int	vdesc_vop_offset;
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 void	assert_vop_locked(struct vnode *vp, const char *str);
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 #define	VN_OPEN_INVFS		0x00000008
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct freebsd11_stat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(int newhashsize);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp, bool force);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
 int	freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(u_int count);
 void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf,
 	    u_int *buflen);
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags, struct thread *td);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 void	vholdnz(struct vnode *);
 void	vinactive(struct vnode *, struct thread *);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vrecyclel(struct vnode *vp);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
 int	vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_need_pageq_flush(struct vnode *vp);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	_vn_lock(struct vnode *vp, int flags, char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
 	    struct ucred *cred);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_tryrlock(vp, start, end)				\
 	rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_trywlock(vp, start, end)				\
 	rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 int	vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 void	vnlru_free(int, struct vfsops *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdioctl(struct vop_ioctl_args *);
 int	vop_stdneed_inactive(struct vop_need_inactive_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_lock(struct vop_lock1_args *);
 int	vop_unlock(struct vop_unlock_args *);
 int	vop_islocked(struct vop_islocked_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_close_post(void *a, int rc);
 void	vop_create_post(void *a, int rc);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_post(void *a, int rc);
 int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
 
 #ifdef DEBUG_VFS_LOCKS
 void	vop_strategy_pre(void *a);
 void	vop_lock_pre(void *a);
 void	vop_lock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
 void	vop_unlock_post(void *a, int rc);
 void	vop_need_inactive_pre(void *a);
 void	vop_need_inactive_post(void *a, int rc);
 #else
 #define	vop_strategy_pre(x)	do { } while (0)
 #define	vop_lock_pre(x)		do { } while (0)
 #define	vop_lock_post(x, y)	do { } while (0)
 #define	vop_unlock_pre(x)	do { } while (0)
 #define	vop_unlock_post(x, y)	do { } while (0)
 #define	vop_need_inactive_pre(x)	do { } while (0)
 #define	vop_need_inactive_post(x, y)	do { } while (0)
 #endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 #ifdef INVARIANTS
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
 	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_SET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_SET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_UNSET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_UNSET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #else
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
 #define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
 #define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
 #endif
 
 #define	VN_IS_DOOMED(vp)	((vp)->v_irflag & VIRF_DOOMED)
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefl(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* fifo_vnops.c */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(int newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 void vfs_mark_atime(struct vnode *vp, struct ucred *cred);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 int vfs_emptydir(struct vnode *vp);
 
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 
 void vn_fsid(struct vnode *vp, struct vattr *va);
 
+#include <sys/kernel.h>
+
+#define VFS_VOP_VECTOR_REGISTER(vnodeops) \
+	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
+	    vfs_vector_op_register, &vnodeops)
+
+void	vfs_vector_op_register(struct vop_vector *vop);
+
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: head/sys/tools/vnode_if.awk
===================================================================
--- head/sys/tools/vnode_if.awk	(revision 355789)
+++ head/sys/tools/vnode_if.awk	(revision 355790)
@@ -1,439 +1,472 @@
 #!/usr/bin/awk -f
 
 #-
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Copyright (c) 1992, 1993
 #	The Regents of the University of California.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 # 3. Neither the name of the University nor the names of its contributors
 #    may be used to endorse or promote products derived from this software
 #    without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 
 #
 #	@(#)vnode_if.sh	8.1 (Berkeley) 6/10/93
 # $FreeBSD$
 #
 # Script to produce VFS front-end sugar.
 #
 # usage: vnode_if.awk <srcfile> [-c | -h | -p | -q]
 #	(where <srcfile> is currently /sys/kern/vnode_if.src)
 #	The source file must have a .src extension
 #
 
 function usage()
 {
 	print "usage: vnode_if.awk <srcfile> [-c|-h|-p|-q]";
 	exit 1;
 }
 
 function die(msg, what)
 {
 	printf srcfile "(" fnr "): " > "/dev/stderr";
 	printf msg "\n", what > "/dev/stderr";
 	exit 1;
 }
 
 function t_spc(type)
 {
 	# Append a space if the type is not a pointer
 	return (type ~ /\*$/) ? type : type " ";
 }
 
 # These are just for convenience ...
 function printc(s) {print s > cfile;}
 function printh(s) {print s > hfile;}
 function printp(s) {print s > pfile;}
 function printq(s) {print s > qfile;}
 
 function add_debug_code(name, arg, pos, ind)
 {
 	if (arg == "vpp")
 		star = "*";
 	else
 		star = "";
 	if (lockdata[name, arg, pos] && (lockdata[name, arg, pos] != "-")) {
 		printc(ind"ASSERT_VI_UNLOCKED("star"a->a_"arg", \""uname"\");");
 		# Add assertions for locking
 		if (lockdata[name, arg, pos] == "L")
 			printc(ind"ASSERT_VOP_LOCKED(" star "a->a_"arg", \""uname"\");");
 		else if (lockdata[name, arg, pos] == "U")
 			printc(ind"ASSERT_VOP_UNLOCKED(" star "a->a_"arg", \""uname"\");");
 		else if (lockdata[name, arg, pos] == "E")
 			printc(ind"ASSERT_VOP_ELOCKED(" star "a->a_"arg", \""uname"\");");
 		else if (0) {
 			# XXX More checks!
 		}
 	}
 }
 
 function add_pre(name)
 {
 	if (lockdata[name, "pre"]) {
 		printc("\t"lockdata[name, "pre"]"(a);");
 	}
 }
 
 function add_post(name)
 {
 	if (lockdata[name, "post"]) {
 		printc("\t"lockdata[name, "post"]"(a, rc);");
 	}
 }
 
 function find_arg_with_type (type)
 {
 	for (jj = 0; jj < numargs; jj++) {
 		if (types[jj] == type) {
 			return "VOPARG_OFFSETOF(struct " \
 			    name "_args,a_" args[jj] ")";
 		}
 	}
 
 	return "VDESC_NO_OFFSET";
 }
 
 BEGIN{
 
 # Process the command line
 for (i = 1; i < ARGC; i++) {
 	arg = ARGV[i];
 	if (arg !~ /^-[chpq]+$/ && arg !~ /\.src$/)
 		usage();
 	if (arg ~ /^-.*c/)
 		cfile = "vnode_if.c";
 	if (arg ~ /^-.*h/)
 		hfile = "vnode_if.h";
 	if (arg ~ /^-.*p/)
 		pfile = "vnode_if_newproto.h";
 	if (arg ~ /^-.*q/)
 		qfile = "vnode_if_typedef.h";
 	if (arg ~ /\.src$/)
 		srcfile = arg;
 }
 ARGC = 1;
 
 if (!cfile && !hfile && !pfile && !qfile)
 	exit 0;
 
 if (!srcfile)
 	usage();
 
 # Avoid a literal generated file tag here.
 generated = "@" "generated";
 
 common_head = \
     "/*\n" \
     " * This file is " generated " automatically.\n" \
     " * Do not modify anything in here by hand.\n" \
     " *\n" \
     " * Created from $FreeBSD$\n" \
     " */\n" \
     "\n";
 
 if (pfile) {
 	printp(common_head)
 	printp("struct vop_vector {")
 	printp("\tstruct vop_vector\t*vop_default;")
 	printp("\tvop_bypass_t\t*vop_bypass;")
 }
 
 if (qfile) {
 	printq(common_head)
 }
 
 if (hfile) {
 	printh(common_head "extern struct vnodeop_desc vop_default_desc;");
 	printh("#include \"vnode_if_typedef.h\"")
 	printh("#include \"vnode_if_newproto.h\"")
 }
 
 if (cfile) {
 	printc(common_head \
 	    "#include <sys/param.h>\n" \
 	    "#include <sys/event.h>\n" \
 	    "#include <sys/kernel.h>\n" \
 	    "#include <sys/mount.h>\n" \
 	    "#include <sys/sdt.h>\n" \
 	    "#include <sys/signalvar.h>\n" \
 	    "#include <sys/systm.h>\n" \
 	    "#include <sys/vnode.h>\n" \
 	    "\n" \
 	    "SDT_PROVIDER_DECLARE(vfs);\n" \
 	    "\n" \
 	    "struct vnodeop_desc vop_default_desc = {\n" \
 	    "	\"default\",\n" \
 	    "	0,\n" \
 	    "   0,\n" \
 	    "	(vop_bypass_t *)vop_panic,\n" \
 	    "	NULL,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "	VDESC_NO_OFFSET,\n" \
 	    "};\n");
 }
 
 while ((getline < srcfile) > 0) {
 	fnr++;
 	if (NF == 0)
 		continue;
 	if ($1 ~ /^%%/) {
 		if (NF != 6 ||
 		    $2 !~ /^[a-z_]+$/  ||  $3 !~ /^[a-z]+$/  ||
 		    $4 !~ /^.$/  ||  $5 !~ /^.$/  ||  $6 !~ /^.$/) {
 			die("Invalid %s construction", "%%");
 			continue;
 		}
 		lockdata["vop_" $2, $3, "Entry"] = $4;
 		lockdata["vop_" $2, $3, "OK"]    = $5;
 		lockdata["vop_" $2, $3, "Error"] = $6;			
 		continue;
 	}
 
 	if ($1 ~ /^%!/) {
 		if (NF != 4 ||
 		    ($3 != "pre" && $3 != "post")) {
 			die("Invalid %s construction", "%!");
 			continue;
 		}
 		lockdata["vop_" $2, $3] = $4;
 		continue;
 	}
 	if ($1 ~ /^#/)
 		continue;
 
 	# Get the function name.
 	name = $1;
 	uname = toupper(name);
 
 	# Get the function arguments.
 	for (numargs = 0; ; ++numargs) {
 		if ((getline < srcfile) <= 0) {
 			die("Unable to read through the arguments for \"%s\"",
 			    name);
 		}
 		fnr++;
 		if ($1 ~ /^\};/)
 			break;
 
 		# Delete comments, if any.
 		gsub (/\/\*.*\*\//, "");
 
 		# Condense whitespace and delete leading/trailing space.
 		gsub(/[[:space:]]+/, " ");
 		sub(/^ /, "");
 		sub(/ $/, "");
 
 		# Pick off direction.
 		if ($1 != "INOUT" && $1 != "IN" && $1 != "OUT")
 			die("No IN/OUT direction for \"%s\".", $0);
 		dirs[numargs] = $1;
 		sub(/^[A-Z]* /, "");
 
 		if ((reles[numargs] = $1) == "WILLRELE")
 			sub(/^[A-Z]* /, "");
 		else
 			reles[numargs] = "WONTRELE";
 
 		# kill trailing ;
 		if (sub(/;$/, "") < 1)
 			die("Missing end-of-line ; in \"%s\".", $0);
 
 		# pick off variable name
 		if ((argp = match($0, /[A-Za-z0-9_]+$/)) < 1)
 			die("Missing var name \"a_foo\" in \"%s\".", $0);
 		args[numargs] = substr($0, argp);
 		$0 = substr($0, 1, argp - 1);
 
 		# what is left must be type
 		# remove trailing space (if any)
 		sub(/ $/, "");
 		types[numargs] = $0;
 	}
 	if (numargs > 4)
 		ctrargs = 4;
 	else
 		ctrargs = numargs;
 	ctrstr = ctrargs "(KTR_VOP, \"VOP\", \"" uname "\", (uintptr_t)a,\n\t    "; 
 	ctrstr = ctrstr "\"" args[0] ":0x%jX\", (uintptr_t)a->a_" args[0];
 	for (i = 1; i < ctrargs; ++i)
 		ctrstr = ctrstr ", \"" args[i] ":0x%jX\", a->a_" args[i];
 	ctrstr = ctrstr ");";
 
 	if (pfile) {
 		printp("\t"name"_t\t*"name";")
 	}
 	if (qfile) {
 		printq("struct "name"_args;")
 		printq("typedef int "name"_t(struct "name"_args *);\n")
 	}
 
 	if (hfile) {
 		# Print out the vop_F_args structure.
 		printh("struct "name"_args {\n\tstruct vop_generic_args a_gen;");
 		for (i = 0; i < numargs; ++i)
 			printh("\t" t_spc(types[i]) "a_" args[i] ";");
 		printh("};");
 		printh("");
 
 		# Print out extern declaration.
 		printh("extern struct vnodeop_desc " name "_desc;");
 		printh("");
 
 		# Print out function prototypes.
 		printh("int " uname "_AP(struct " name "_args *);");
 		printh("int " uname "_APV(struct vop_vector *vop, struct " name "_args *);");
 		printh("");
 		printh("static __inline int " uname "(");
 		for (i = 0; i < numargs; ++i) {
 			printh("\t" t_spc(types[i]) args[i] \
 			    (i < numargs - 1 ? "," : ")"));
 		}
 		printh("{");
 		printh("\tstruct " name "_args a;");
 		printh("");
 		printh("\ta.a_gen.a_desc = &" name "_desc;");
 		for (i = 0; i < numargs; ++i)
 			printh("\ta.a_" args[i] " = " args[i] ";");
 		printh("\treturn (" uname "_APV("args[0]"->v_op, &a));");
 		printh("}");
 
 		printh("");
 	}
 
 	if (cfile) {
+		funcarr[name] = 1;
 		# Print out the vop_F_vp_offsets structure.  This all depends
 		# on naming conventions and nothing else.
 		printc("static int " name "_vp_offsets[] = {");
 		# as a side effect, figure out the releflags
 		releflags = "";
 		vpnum = 0;
 		for (i = 0; i < numargs; i++) {
 			if (types[i] == "struct vnode *") {
 				printc("\tVOPARG_OFFSETOF(struct " name \
 				    "_args,a_" args[i] "),");
 				if (reles[i] == "WILLRELE") {
 					releflags = releflags \
 					    "|VDESC_VP" vpnum "_WILLRELE";
 				}
 				vpnum++;
 			}
 		}
 
 		sub(/^\|/, "", releflags);
 		printc("\tVDESC_NO_OFFSET");
 		printc("};");
 
 		printc("\n");
 		printc("SDT_PROBE_DEFINE2(vfs, vop, " name ", entry, \"struct vnode *\", \"struct " name "_args *\");\n");
 		printc("SDT_PROBE_DEFINE3(vfs, vop, " name ", return, \"struct vnode *\", \"struct " name "_args *\", \"int\");\n");
 
 		# Print out function.
 		printc("\nint\n" uname "_AP(struct " name "_args *a)");
 		printc("{");
 		printc("");
 		printc("\treturn(" uname "_APV(a->a_" args[0] "->v_op, a));");
 		printc("}");
 		printc("\nint\n" uname "_APV(struct vop_vector *vop, struct " name "_args *a)");
 		printc("{");
 		printc("\tint rc;");
 		printc("");
 		printc("\tVNASSERT(a->a_gen.a_desc == &" name "_desc, a->a_" args[0]",");
 		printc("\t    (\"Wrong a_desc in " name "(%p, %p)\", a->a_" args[0]", a));");
-		printc("\twhile(vop != NULL && \\");
-		printc("\t    vop->"name" == NULL && vop->vop_bypass == NULL)")
-		printc("\t\tvop = vop->vop_default;")
 		printc("\tVNASSERT(vop != NULL, a->a_" args[0]", (\"No "name"(%p, %p)\", a->a_" args[0]", a));")
 		printc("\tSDT_PROBE2(vfs, vop, " name ", entry, a->a_" args[0] ", a);\n");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "Entry", "\t");
 		printc("\tKTR_START" ctrstr);
 		add_pre(name);
 		printc("\tif (vop->"name" != NULL)")
 		printc("\t\trc = vop->"name"(a);")
 		printc("\telse")
 		printc("\t\trc = vop->vop_bypass(&a->a_gen);")
 		printc("\tSDT_PROBE3(vfs, vop, " name ", return, a->a_" args[0] ", a, rc);\n");
 		printc("\tif (rc == 0) {");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "OK", "\t\t");
 		printc("\t} else {");
 		for (i = 0; i < numargs; ++i)
 			add_debug_code(name, args[i], "Error", "\t\t");
 		printc("\t}");
 		add_post(name);
 		printc("\tKTR_STOP" ctrstr);
 		printc("\treturn (rc);");
 		printc("}\n");
 
 		# Print out the vnodeop_desc structure.
 		printc("struct vnodeop_desc " name "_desc = {");
 		# printable name
 		printc("\t\"" name "\",");
 		# flags
 		vppwillrele = "";
 		for (i = 0; i < numargs; i++) {
 			if (types[i] == "struct vnode **" && \
 			    reles[i] == "WILLRELE") {
 				vppwillrele = "|VDESC_VPP_WILLRELE";
 			}
 		}
 
 		if (!releflags)
 			releflags = "0";
 		printc("\t" releflags vppwillrele ",");
 
 		# index in struct vop_vector
 		printc("\t__offsetof(struct vop_vector, " name "),");
 		# function to call
 		printc("\t(vop_bypass_t *)" uname "_AP,");
 		# vp offsets
 		printc("\t" name "_vp_offsets,");
 		# vpp (if any)
 		printc("\t" find_arg_with_type("struct vnode **") ",");
 		# cred (if any)
 		printc("\t" find_arg_with_type("struct ucred *") ",");
 		# thread (if any)
 		printc("\t" find_arg_with_type("struct thread *") ",");
 		# componentname
 		printc("\t" find_arg_with_type("struct componentname *") ",");
 		# transport layer information
 		printc("};\n");
 	}
 }
- 
-if (pfile)
+
+if (cfile) {
+	printc("void");
+	printc("vfs_vector_op_register(struct vop_vector *orig_vop)");
+	printc("{");
+	printc("\tstruct vop_vector *vop;");
+	printc("");
+	printc("\tif (orig_vop->registered)");
+	printc("\t\tpanic(\"%s: vop_vector %p already registered\",")
+	printc("\t\t    __func__, orig_vop);");
+	printc("");
+	for (name in funcarr) {
+		printc("\tvop = orig_vop;");
+		printc("\twhile (vop != NULL && \\");
+		printc("\t    vop->"name" == NULL && vop->vop_bypass == NULL)")
+		printc("\t\tvop = vop->vop_default;")
+		printc("\tif (vop != NULL)");
+		printc("\t\torig_vop->"name" = vop->"name";");
+		printc("");
+	}
+	printc("\tvop = orig_vop;");
+	printc("\twhile (vop != NULL && vop->vop_bypass == NULL)")
+	printc("\t\tvop = vop->vop_default;")
+	printc("\tif (vop != NULL)");
+	printc("\t\torig_vop->vop_bypass = vop->vop_bypass;");
+	printc("");
+	printc("\torig_vop->registered = true;");
+	printc("}")
+}
+
+if (hfile) {
+	printh("void vfs_vector_op_register(struct vop_vector *orig_vop);");
+}
+
+if (pfile) {
+	printp("\tbool\tregistered;")
 	printp("};")
+}
  
 if (hfile)
 	close(hfile);
 if (cfile)
 	close(cfile);
 if (pfile)
 	close(pfile);
 close(srcfile);
 
 exit 0;
 
 }
Index: head/sys/ufs/ffs/ffs_vnops.c
===================================================================
--- head/sys/ufs/ffs/ffs_vnops.c	(revision 355789)
+++ head/sys/ufs/ffs/ffs_vnops.c	(revision 355790)
@@ -1,1767 +1,1771 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
  *
  * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Marshall
  * Kirk McKusick and Network Associates Laboratories, the Security
  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  * research program
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
  * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
  *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/extattr.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufsmount.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 #include "opt_directio.h"
 #include "opt_ffs.h"
 
 #define	ALIGNED_TO(ptr, s)	\
 	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)
 
 #ifdef DIRECTIO
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
 static vop_fdatasync_t	ffs_fdatasync;
 static vop_fsync_t	ffs_fsync;
 static vop_getpages_t	ffs_getpages;
 static vop_getpages_async_t	ffs_getpages_async;
 static vop_lock1_t	ffs_lock;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
 static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
 static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
 		    struct ucred *cred);
 static vop_strategy_t	ffsext_strategy;
 static vop_closeextattr_t	ffs_closeextattr;
 static vop_deleteextattr_t	ffs_deleteextattr;
 static vop_getextattr_t	ffs_getextattr;
 static vop_listextattr_t	ffs_listextattr;
 static vop_openextattr_t	ffs_openextattr;
 static vop_setextattr_t	ffs_setextattr;
 static vop_vptofh_t	ffs_vptofh;
 
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
 	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	ffs_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
 	.vop_vptofh =		ffs_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);
 
 struct vop_vector ffs_fifoops1 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
 	.vop_lock1 =		ffs_lock,
 	.vop_vptofh =		ffs_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);
 
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops2 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
 	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	ffs_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_write =		ffs_write,
 	.vop_closeextattr =	ffs_closeextattr,
 	.vop_deleteextattr =	ffs_deleteextattr,
 	.vop_getextattr =	ffs_getextattr,
 	.vop_listextattr =	ffs_listextattr,
 	.vop_openextattr =	ffs_openextattr,
 	.vop_setextattr =	ffs_setextattr,
 	.vop_vptofh =		ffs_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);
 
 struct vop_vector ffs_fifoops2 = {
 	.vop_default =		&ufs_fifoops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
 	.vop_lock1 =		ffs_lock,
 	.vop_reallocblks =	ffs_reallocblks,
 	.vop_strategy =		ffsext_strategy,
 	.vop_closeextattr =	ffs_closeextattr,
 	.vop_deleteextattr =	ffs_deleteextattr,
 	.vop_getextattr =	ffs_getextattr,
 	.vop_listextattr =	ffs_listextattr,
 	.vop_openextattr =	ffs_openextattr,
 	.vop_setextattr =	ffs_setextattr,
 	.vop_vptofh =		ffs_vptofh,
 };
+VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
 
 /*
  * Synch an open file.
  */
 /* ARGSUSED */
 static int
 ffs_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int error;
 
 	vp = ap->a_vp;
 	bo = &vp->v_bufobj;
 retry:
 	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
 	if (error)
 		return (error);
 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
 		error = softdep_fsync(vp);
 		if (error)
 			return (error);
 
 		/*
 		 * The softdep_fsync() function may drop vp lock,
 		 * allowing for dirty buffers to reappear on the
 		 * bo_dirty list. Recheck and resync as needed.
 		 */
 		BO_LOCK(bo);
 		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
 		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
 			BO_UNLOCK(bo);
 			goto retry;
 		}
 		BO_UNLOCK(bo);
 	}
 	return (0);
 }
 
 int
 ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
 {
 	struct inode *ip;
 	struct bufobj *bo;
 	struct buf *bp, *nbp;
 	ufs_lbn_t lbn;
 	int error, passes;
 	bool still_dirty, wait;
 
 	ip = VTOI(vp);
 	ip->i_flag &= ~IN_NEEDSYNC;
 	bo = &vp->v_bufobj;
 
 	/*
 	 * When doing MNT_WAIT we must first flush all dependencies
 	 * on the inode.
 	 */
 	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
 	    (error = softdep_sync_metadata(vp)) != 0)
 		return (error);
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 	error = 0;
 	passes = 0;
 	wait = false;	/* Always do an async pass first. */
 	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
 	BO_LOCK(bo);
 loop:
 	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 		bp->b_vflags &= ~BV_SCANNED;
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		/*
 		 * Reasons to skip this buffer: it has already been considered
 		 * on this pass, the buffer has dependencies that will cause
 		 * it to be redirtied and it has not already been deferred,
 		 * or it is already being written.
 		 */
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
 		/*
 		 * Flush indirects in order, if requested.
 		 *
 		 * Note that if only datasync is requested, we can
 		 * skip indirect blocks when softupdates are not
 		 * active.  Otherwise we must flush them with data,
 		 * since dependencies prevent data block writes.
 		 */
 		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
 		    (lbn_level(bp->b_lblkno) >= passes ||
 		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
 			continue;
 		if (bp->b_lblkno > lbn)
 			panic("ffs_syncvnode: syncing truncated data.");
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
 			BO_UNLOCK(bo);
 		} else if (wait) {
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    BO_LOCKPTR(bo)) != 0) {
 				bp->b_vflags &= ~BV_SCANNED;
 				goto next;
 			}
 		} else
 			continue;
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("ffs_fsync: not dirty");
 		/*
 		 * Check for dependencies and potentially complete them.
 		 */
 		if (!LIST_EMPTY(&bp->b_dep) &&
 		    (error = softdep_sync_buf(vp, bp,
 		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
 			/* I/O error. */
 			if (error != EBUSY) {
 				BUF_UNLOCK(bp);
 				return (error);
 			}
 			/* If we deferred once, don't defer again. */
 		    	if ((bp->b_flags & B_DEFERRED) == 0) {
 				bp->b_flags |= B_DEFERRED;
 				BUF_UNLOCK(bp);
 				goto next;
 			}
 		}
 		if (wait) {
 			bremfree(bp);
 			if ((error = bwrite(bp)) != 0)
 				return (error);
 		} else if ((bp->b_flags & B_CLUSTEROK)) {
 			(void) vfs_bio_awrite(bp);
 		} else {
 			bremfree(bp);
 			(void) bawrite(bp);
 		}
 next:
 		/*
 		 * Since we may have slept during the I/O, we need
 		 * to start from a known point.
 		 */
 		BO_LOCK(bo);
 		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
 	}
 	if (waitfor != MNT_WAIT) {
 		BO_UNLOCK(bo);
 		if ((flags & NO_INO_UPDT) != 0)
 			return (0);
 		else
 			return (ffs_update(vp, 0));
 	}
 	/* Drain IO to see if we're done. */
 	bufobj_wwait(bo, 0, 0);
 	/*
 	 * Block devices associated with filesystems may have new I/O
 	 * requests posted for them even if the vnode is locked, so no
 	 * amount of trying will get them clean.  We make several passes
 	 * as a best effort.
 	 *
 	 * Regular files may need multiple passes to flush all dependency
 	 * work as it is possible that we must write once per indirect
 	 * level, once for the leaf, and once for the inode and each of
 	 * these will be done with one sync and one async pass.
 	 */
 	if (bo->bo_dirty.bv_cnt > 0) {
 		if ((flags & DATA_ONLY) == 0) {
 			still_dirty = true;
 		} else {
 			/*
 			 * For data-only sync, dirty indirect buffers
 			 * are ignored.
 			 */
 			still_dirty = false;
 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 				if (bp->b_lblkno > -UFS_NDADDR) {
 					still_dirty = true;
 					break;
 				}
 			}
 		}
 
 		if (still_dirty) {
 			/* Write the inode after sync passes to flush deps. */
 			if (wait && DOINGSOFTDEP(vp) &&
 			    (flags & NO_INO_UPDT) == 0) {
 				BO_UNLOCK(bo);
 				ffs_update(vp, 1);
 				BO_LOCK(bo);
 			}
 			/* switch between sync/async. */
 			wait = !wait;
 			if (wait || ++passes < UFS_NIADDR + 2)
 				goto loop;
 		}
 	}
 	BO_UNLOCK(bo);
 	error = 0;
 	if ((flags & DATA_ONLY) == 0) {
 		if ((flags & NO_INO_UPDT) == 0)
 			error = ffs_update(vp, 1);
 		if (DOINGSUJ(vp))
 			softdep_journal_fsync(VTOI(vp));
 	}
 	return (error);
 }
 
 static int
 ffs_fdatasync(struct vop_fdatasync_args *ap)
 {
 
 	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
 }
 
 static int
 ffs_lock(ap)
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		struct thread *a_td;
 		char *file;
 		int line;
 	} */ *ap;
 {
 #ifndef NO_FFS_SNAPSHOT
 	struct vnode *vp;
 	int flags;
 	struct lock *lkp;
 	int result;
 
 	switch (ap->a_flags & LK_TYPE_MASK) {
 	case LK_SHARED:
 	case LK_UPGRADE:
 	case LK_EXCLUSIVE:
 		vp = ap->a_vp;
 		flags = ap->a_flags;
 		for (;;) {
 #ifdef DEBUG_VFS_LOCKS
 			KASSERT(vp->v_holdcnt != 0,
 			    ("ffs_lock %p: zero hold count", vp));
 #endif
 			lkp = vp->v_vnlock;
 			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 			    ap->a_file, ap->a_line);
 			if (lkp == vp->v_vnlock || result != 0)
 				break;
 			/*
 			 * Apparent success, except that the vnode
 			 * mutated between snapshot file vnode and
 			 * regular file vnode while this process
 			 * slept.  The lock currently held is not the
 			 * right lock.  Release it, and try to get the
 			 * new lock.
 			 */
 			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
 			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
 			    ap->a_file, ap->a_line);
 			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
 			    (LK_INTERLOCK | LK_NOWAIT))
 				return (EBUSY);
 			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
 				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
 			flags &= ~LK_INTERLOCK;
 		}
 		break;
 	default:
 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
 	}
 	return (result);
 #else
 	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
 #endif
 }
 
 static int
 ffs_read_hole(struct uio *uio, long xfersize, long *size)
 {
 	ssize_t saved_resid, tlen;
 	int error;
 
 	while (xfersize > 0) {
 		tlen = min(xfersize, ZERO_REGION_SIZE);
 		saved_resid = uio->uio_resid;
 		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
 		    tlen, uio);
 		if (error != 0)
 			return (error);
 		tlen = saved_resid - uio->uio_resid;
 		xfersize -= tlen;
 		*size -= tlen;
 	}
 	return (0);
 }
 
 /*
  * Vnode op for reading.
  */
 static int
 ffs_read(ap)
 	struct vop_read_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct inode *ip;
 	struct uio *uio;
 	struct fs *fs;
 	struct buf *bp;
 	ufs_lbn_t lbn, nextlbn;
 	off_t bytesinfile;
 	long size, xfersize, blkoffset;
 	ssize_t orig_resid;
 	int bflag, error, ioflag, seqcount;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
 	if (ap->a_ioflag & IO_EXT)
 #ifdef notyet
 		return (ffs_extread(vp, uio, ioflag));
 #else
 		panic("ffs_read+IO_EXT");
 #endif
 #ifdef DIRECTIO
 	if ((ioflag & IO_DIRECT) != 0) {
 		int workdone;
 
 		error = ffs_rawread(vp, uio, &workdone);
 		if (error != 0 || workdone != 0)
 			return error;
 	}
 #endif
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_READ)
 		panic("ffs_read: mode");
 
 	if (vp->v_type == VLNK) {
 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 			panic("ffs_read: short symlink");
 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
 		panic("ffs_read: type %d",  vp->v_type);
 #endif
 	orig_resid = uio->uio_resid;
 	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
 	if (orig_resid == 0)
 		return (0);
 	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
 	fs = ITOFS(ip);
 	if (uio->uio_offset < ip->i_size &&
 	    uio->uio_offset >= fs->fs_maxfilesize)
 		return (EOVERFLOW);
 
 	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 			break;
 		lbn = lblkno(fs, uio->uio_offset);
 		nextlbn = lbn + 1;
 
 		/*
 		 * size of buffer.  The buffer representing the
 		 * end of the file is rounded up to the size of
 		 * the block type ( fragment or full block,
 		 * depending ).
 		 */
 		size = blksize(fs, ip, lbn);
 		blkoffset = blkoff(fs, uio->uio_offset);
 
 		/*
 		 * The amount we want to transfer in this iteration is
 		 * one FS block less the amount of the data before
 		 * our startpoint (duh!)
 		 */
 		xfersize = fs->fs_bsize - blkoffset;
 
 		/*
 		 * But if we actually want less than the block,
 		 * or the file doesn't have a whole block more of data,
 		 * then use the lesser number.
 		 */
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (bytesinfile < xfersize)
 			xfersize = bytesinfile;
 
 		if (lblktosize(fs, nextlbn) >= ip->i_size) {
 			/*
 			 * Don't do readahead if this is the end of the file.
 			 */
 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			/*
 			 * Otherwise if we are allowed to cluster,
 			 * grab as much as we can.
 			 *
 			 * XXX  This may not be a win if we are not
 			 * doing sequential access.
 			 */
 			error = cluster_read(vp, ip->i_size, lbn,
 			    size, NOCRED, blkoffset + uio->uio_resid,
 			    seqcount, bflag, &bp);
 		} else if (seqcount > 1) {
 			/*
 			 * If we are NOT allowed to cluster, then
 			 * if we appear to be acting sequentially,
 			 * fire off a request for a readahead
 			 * as well as a read. Note that the 4th and 5th
 			 * arguments point to arrays of the size specified in
 			 * the 6th argument.
 			 */
 			u_int nextsize = blksize(fs, ip, nextlbn);
 			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
 			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
 		} else {
 			/*
 			 * Failing all of the above, just read what the
 			 * user asked for. Interestingly, the same as
 			 * the first option above.
 			 */
 			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
 		}
 		if (error == EJUSTRETURN) {
 			error = ffs_read_hole(uio, xfersize, &size);
 			if (error == 0)
 				continue;
 		}
 		if (error != 0) {
 			brelse(bp);
 			bp = NULL;
 			break;
 		}
 
 		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
 		 * then we want to ensure that we do not uiomove bad
 		 * or uninitialized data.
 		 */
 		size -= bp->b_resid;
 		if (size < xfersize) {
 			if (size == 0)
 				break;
 			xfersize = size;
 		}
 
 		if (buf_mapped(bp)) {
 			error = vn_io_fault_uiomove((char *)bp->b_data +
 			    blkoffset, (int)xfersize, uio);
 		} else {
 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 			    (int)xfersize, uio);
 		}
 		if (error)
 			break;
 
 		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
 	 * This can only happen in the case of an error
 	 * because the loop above resets bp to NULL on each iteration
 	 * and on normal completion has not set a new value into it.
 	 * so it must have come from a 'break' statement
 	 */
 	if (bp != NULL)
 		vfs_bio_brelse(bp, ioflag);
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
 	    (ip->i_flag & IN_ACCESS) == 0) {
 		VI_LOCK(vp);
 		ip->i_flag |= IN_ACCESS;
 		VI_UNLOCK(vp);
 	}
 	return (error);
 }
 
 /*
  * Vnode op for writing.
  */
 static int
 ffs_write(ap)
 	struct vop_write_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct inode *ip;
 	struct fs *fs;
 	struct buf *bp;
 	ufs_lbn_t lbn;
 	off_t osize;
 	ssize_t resid;
 	int seqcount;
 	int blkoffset, error, flags, ioflag, size, xfersize;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
 	if (ap->a_ioflag & IO_EXT)
 #ifdef notyet
 		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
 #else
 		panic("ffs_write+IO_EXT");
 #endif
 
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_WRITE)
 		panic("ffs_write: mode");
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = ip->i_size;
 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 			return (EPERM);
 		/* FALLTHROUGH */
 	case VLNK:
 		break;
 	case VDIR:
 		panic("ffs_write: dir write");
 		break;
 	default:
 		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 			(int)uio->uio_offset,
 			(int)uio->uio_resid
 		);
 	}
 
 	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
 	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
 	fs = ITOFS(ip);
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
 		return (EFBIG);
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, I don't think it matters.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 
 	resid = uio->uio_resid;
 	osize = ip->i_size;
 	if (seqcount > BA_SEQMAX)
 		flags = BA_SEQMAX << BA_SEQSHIFT;
 	else
 		flags = seqcount << BA_SEQSHIFT;
 	if (ioflag & IO_SYNC)
 		flags |= IO_SYNC;
 	flags |= BA_UNMAPPED;
 
 	for (error = 0; uio->uio_resid > 0;) {
 		lbn = lblkno(fs, uio->uio_offset);
 		blkoffset = blkoff(fs, uio->uio_offset);
 		xfersize = fs->fs_bsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (uio->uio_offset + xfersize > ip->i_size)
 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 
 		/*
 		 * We must perform a read-before-write if the transfer size
 		 * does not cover the entire buffer.
 		 */
 		if (fs->fs_bsize > xfersize)
 			flags |= BA_CLRBUF;
 		else
 			flags &= ~BA_CLRBUF;
 /* XXX is uio->uio_offset the right thing here? */
 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 		    ap->a_cred, flags, &bp);
 		if (error != 0) {
 			vnode_pager_setsize(vp, ip->i_size);
 			break;
 		}
 		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 			bp->b_flags |= B_NOCACHE;
 
 		if (uio->uio_offset + xfersize > ip->i_size) {
 			ip->i_size = uio->uio_offset + xfersize;
 			DIP_SET(ip, i_size, ip->i_size);
 		}
 
 		size = blksize(fs, ip, lbn) - bp->b_resid;
 		if (size < xfersize)
 			xfersize = size;
 
 		if (buf_mapped(bp)) {
 			error = vn_io_fault_uiomove((char *)bp->b_data +
 			    blkoffset, (int)xfersize, uio);
 		} else {
 			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
 			    (int)xfersize, uio);
 		}
 		/*
 		 * If the buffer is not already filled and we encounter an
 		 * error while trying to fill it, we have to clear out any
 		 * garbage data from the pages instantiated for the buffer.
 		 * If we do not, a failed uiomove() during a write can leave
 		 * the prior contents of the pages exposed to a userland mmap.
 		 *
 		 * Note that we need only clear buffers with a transfer size
 		 * equal to the block size because buffers with a shorter
 		 * transfer size were cleared above by the call to UFS_BALLOC()
 		 * with the BA_CLRBUF flag set.
 		 *
 		 * If the source region for uiomove identically mmaps the
 		 * buffer, uiomove() performed the NOP copy, and the buffer
 		 * content remains valid because the page fault handler
 		 * validated the pages.
 		 */
 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 		    fs->fs_bsize == xfersize)
 			vfs_bio_clrbuf(bp);
 
 		vfs_bio_set_flags(bp, ioflag);
 
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
 		 * asynchronously.  Otherwise try to cluster, and if that
 		 * doesn't do it then either do an async write (if O_DIRECT),
 		 * or a delayed write (if not).
 		 */
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() ||
 			    buf_dirty_count_severe() ||
 			    (ioflag & IO_ASYNC)) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else if (xfersize + blkoffset == fs->fs_bsize) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
 				cluster_write(vp, bp, ip->i_size, seqcount,
 				    GB_UNMAPPED);
 			} else {
 				bawrite(bp);
 			}
 		} else if (ioflag & IO_DIRECT) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else {
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
 		if (error || xfersize == 0)
 			break;
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
 	/*
 	 * If we successfully wrote any data, and we are not the superuser
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 	    ap->a_cred) {
 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			DIP_SET(ip, i_mode, ip->i_mode);
 		}
 	}
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			(void)ffs_truncate(vp, osize,
 			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		}
 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 		error = ffs_update(vp, 1);
 	return (error);
 }
 
 /*
  * Extended attribute area reading.
  */
 static int
 ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
 {
 	struct inode *ip;
 	struct ufs2_dinode *dp;
 	struct fs *fs;
 	struct buf *bp;
 	ufs_lbn_t lbn, nextlbn;
 	off_t bytesinfile;
 	long size, xfersize, blkoffset;
 	ssize_t orig_resid;
 	int error;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 	dp = ip->i_din2;
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
 		panic("ffs_extread: mode");
 
 #endif
 	orig_resid = uio->uio_resid;
 	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
 	if (orig_resid == 0)
 		return (0);
 	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
 
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
 			break;
 		lbn = lblkno(fs, uio->uio_offset);
 		nextlbn = lbn + 1;
 
 		/*
 		 * size of buffer.  The buffer representing the
 		 * end of the file is rounded up to the size of
 		 * the block type ( fragment or full block,
 		 * depending ).
 		 */
 		size = sblksize(fs, dp->di_extsize, lbn);
 		blkoffset = blkoff(fs, uio->uio_offset);
 
 		/*
 		 * The amount we want to transfer in this iteration is
 		 * one FS block less the amount of the data before
 		 * our startpoint (duh!)
 		 */
 		xfersize = fs->fs_bsize - blkoffset;
 
 		/*
 		 * But if we actually want less than the block,
 		 * or the file doesn't have a whole block more of data,
 		 * then use the lesser number.
 		 */
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (bytesinfile < xfersize)
 			xfersize = bytesinfile;
 
 		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
 			/*
 			 * Don't do readahead if this is the end of the info.
 			 */
 			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
 		} else {
 			/*
 			 * If we have a second block, then
 			 * fire off a request for a readahead
 			 * as well as a read. Note that the 4th and 5th
 			 * arguments point to arrays of the size specified in
 			 * the 6th argument.
 			 */
 			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
 
 			nextlbn = -1 - nextlbn;
 			error = breadn(vp, -1 - lbn,
 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 		}
 		if (error) {
 			brelse(bp);
 			bp = NULL;
 			break;
 		}
 
 		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
 		 * then we want to ensure that we do not uiomove bad
 		 * or uninitialized data.
 		 */
 		size -= bp->b_resid;
 		if (size < xfersize) {
 			if (size == 0)
 				break;
 			xfersize = size;
 		}
 
 		error = uiomove((char *)bp->b_data + blkoffset,
 					(int)xfersize, uio);
 		if (error)
 			break;
 		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
 	 * This can only happen in the case of an error
 	 * because the loop above resets bp to NULL on each iteration
 	 * and on normal completion has not set a new value into it.
 	 * so it must have come from a 'break' statement
 	 */
 	if (bp != NULL)
 		vfs_bio_brelse(bp, ioflag);
 	return (error);
 }
 
 /*
  * Extended attribute area writing.
  */
 static int
 ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
 {
 	struct inode *ip;
 	struct ufs2_dinode *dp;
 	struct fs *fs;
 	struct buf *bp;
 	ufs_lbn_t lbn;
 	off_t osize;
 	ssize_t resid;
 	int blkoffset, error, flags, size, xfersize;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 	dp = ip->i_din2;
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
 		panic("ffs_extwrite: mode");
 #endif
 
 	if (ioflag & IO_APPEND)
 		uio->uio_offset = dp->di_extsize;
 	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
 	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
 	if ((uoff_t)uio->uio_offset + uio->uio_resid >
 	    UFS_NXADDR * fs->fs_bsize)
 		return (EFBIG);
 
 	resid = uio->uio_resid;
 	osize = dp->di_extsize;
 	flags = IO_EXT;
 	if (ioflag & IO_SYNC)
 		flags |= IO_SYNC;
 
 	for (error = 0; uio->uio_resid > 0;) {
 		lbn = lblkno(fs, uio->uio_offset);
 		blkoffset = blkoff(fs, uio->uio_offset);
 		xfersize = fs->fs_bsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 
 		/*
 		 * We must perform a read-before-write if the transfer size
 		 * does not cover the entire buffer.
 		 */
 		if (fs->fs_bsize > xfersize)
 			flags |= BA_CLRBUF;
 		else
 			flags &= ~BA_CLRBUF;
 		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 		    ucred, flags, &bp);
 		if (error != 0)
 			break;
 		/*
 		 * If the buffer is not valid we have to clear out any
 		 * garbage data from the pages instantiated for the buffer.
 		 * If we do not, a failed uiomove() during a write can leave
 		 * the prior contents of the pages exposed to a userland
 		 * mmap().  XXX deal with uiomove() errors a better way.
 		 */
 		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 			vfs_bio_clrbuf(bp);
 
 		if (uio->uio_offset + xfersize > dp->di_extsize)
 			dp->di_extsize = uio->uio_offset + xfersize;
 
 		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
 		if (size < xfersize)
 			xfersize = size;
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 
 		vfs_bio_set_flags(bp, ioflag);
 
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
 		 * asynchronously.  Otherwise try to cluster, and if that
 		 * doesn't do it then either do an async write (if O_DIRECT),
 		 * or a delayed write (if not).
 		 */
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() ||
 			    buf_dirty_count_severe() ||
 			    xfersize + blkoffset == fs->fs_bsize ||
 			    (ioflag & (IO_ASYNC | IO_DIRECT)))
 			bawrite(bp);
 		else
 			bdwrite(bp);
 		if (error || xfersize == 0)
 			break;
 		ip->i_flag |= IN_CHANGE;
 	}
 	/*
 	 * If we successfully wrote any data, and we are not the superuser
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
 		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			dp->di_mode = ip->i_mode;
 		}
 	}
 	if (error) {
 		if (ioflag & IO_UNIT) {
 			(void)ffs_truncate(vp, osize,
 			    IO_EXT | (ioflag&IO_SYNC), ucred);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		}
 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 		error = ffs_update(vp, 1);
 	return (error);
 }
 
 
 /*
  * Vnode operating to retrieve a named extended attribute.
  *
  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
  * the length of the EA, and possibly the pointer to the entry and to the data.
  */
 static int
 ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
     struct extattr **eapp, u_char **eac)
 {
 	struct extattr *eap, *eaend;
 	size_t nlen;
 
 	nlen = strlen(name);
 	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
 	eap = (struct extattr *)ptr;
 	eaend = (struct extattr *)(ptr + length);
 	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
 		/* make sure this entry is complete */
 		if (EXTATTR_NEXT(eap) > eaend)
 			break;
 		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
 		    || memcmp(eap->ea_name, name, nlen) != 0)
 			continue;
 		if (eapp != NULL)
 			*eapp = eap;
 		if (eac != NULL)
 			*eac = EXTATTR_CONTENT(eap);
 		return (EXTATTR_CONTENT_SIZE(eap));
 	}
 	return (-1);
 }
 
 static int
 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
 {
 	struct inode *ip;
 	struct ufs2_dinode *dp;
 	struct fs *fs;
 	struct uio luio;
 	struct iovec liovec;
 	u_int easize;
 	int error;
 	u_char *eae;
 
 	ip = VTOI(vp);
 	fs = ITOFS(ip);
 	dp = ip->i_din2;
 	easize = dp->di_extsize;
 	if ((uoff_t)easize + extra > UFS_NXADDR * fs->fs_bsize)
 		return (EFBIG);
 
 	eae = malloc(easize + extra, M_TEMP, M_WAITOK);
 
 	liovec.iov_base = eae;
 	liovec.iov_len = easize;
 	luio.uio_iov = &liovec;
 	luio.uio_iovcnt = 1;
 	luio.uio_offset = 0;
 	luio.uio_resid = easize;
 	luio.uio_segflg = UIO_SYSSPACE;
 	luio.uio_rw = UIO_READ;
 	luio.uio_td = td;
 
 	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
 	if (error) {
 		free(eae, M_TEMP);
 		return(error);
 	}
 	*p = eae;
 	return (0);
 }
 
 static void
 ffs_lock_ea(struct vnode *vp)
 {
 	struct inode *ip;
 
 	ip = VTOI(vp);
 	VI_LOCK(vp);
 	while (ip->i_flag & IN_EA_LOCKED) {
 		ip->i_flag |= IN_EA_LOCKWAIT;
 		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
 		    0);
 	}
 	ip->i_flag |= IN_EA_LOCKED;
 	VI_UNLOCK(vp);
 }
 
 static void
 ffs_unlock_ea(struct vnode *vp)
 {
 	struct inode *ip;
 
 	ip = VTOI(vp);
 	VI_LOCK(vp);
 	if (ip->i_flag & IN_EA_LOCKWAIT)
 		wakeup(&ip->i_ea_refs);
 	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
 	VI_UNLOCK(vp);
 }
 
 static int
 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip;
 	struct ufs2_dinode *dp;
 	int error;
 
 	ip = VTOI(vp);
 
 	ffs_lock_ea(vp);
 	if (ip->i_ea_area != NULL) {
 		ip->i_ea_refs++;
 		ffs_unlock_ea(vp);
 		return (0);
 	}
 	dp = ip->i_din2;
 	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
 	if (error) {
 		ffs_unlock_ea(vp);
 		return (error);
 	}
 	ip->i_ea_len = dp->di_extsize;
 	ip->i_ea_error = 0;
 	ip->i_ea_refs++;
 	ffs_unlock_ea(vp);
 	return (0);
 }
 
 /*
  * Vnode extattr transaction commit/abort
  */
 static int
 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip;
 	struct uio luio;
 	struct iovec liovec;
 	int error;
 	struct ufs2_dinode *dp;
 
 	ip = VTOI(vp);
 
 	ffs_lock_ea(vp);
 	if (ip->i_ea_area == NULL) {
 		ffs_unlock_ea(vp);
 		return (EINVAL);
 	}
 	dp = ip->i_din2;
 	error = ip->i_ea_error;
 	if (commit && error == 0) {
 		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
 		if (cred == NOCRED)
 			cred =  vp->v_mount->mnt_cred;
 		liovec.iov_base = ip->i_ea_area;
 		liovec.iov_len = ip->i_ea_len;
 		luio.uio_iov = &liovec;
 		luio.uio_iovcnt = 1;
 		luio.uio_offset = 0;
 		luio.uio_resid = ip->i_ea_len;
 		luio.uio_segflg = UIO_SYSSPACE;
 		luio.uio_rw = UIO_WRITE;
 		luio.uio_td = td;
 		/* XXX: I'm not happy about truncating to zero size */
 		if (ip->i_ea_len < dp->di_extsize)
 			error = ffs_truncate(vp, 0, IO_EXT, cred);
 		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
 	}
 	if (--ip->i_ea_refs == 0) {
 		free(ip->i_ea_area, M_TEMP);
 		ip->i_ea_area = NULL;
 		ip->i_ea_len = 0;
 		ip->i_ea_error = 0;
 	}
 	ffs_unlock_ea(vp);
 	return (error);
 }
 
 /*
  * Vnode extattr strategy routine for fifos.
  *
  * We need to check for a read or write of the external attributes.
  * Otherwise we just fall through and do the usual thing.
  */
 static int
 ffsext_strategy(struct vop_strategy_args *ap)
 /*
 struct vop_strategy_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_vp;
 	struct buf *a_bp;
 };
 */
 {
 	struct vnode *vp;
 	daddr_t lbn;
 
 	vp = ap->a_vp;
 	lbn = ap->a_bp->b_lblkno;
 	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
 		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
 	if (vp->v_type == VFIFO)
 		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
 	panic("spec nodes went here");
 }
 
 /*
  * Vnode extattr transaction commit/abort
  */
 static int
 ffs_openextattr(struct vop_openextattr_args *ap)
 /*
 struct vop_openextattr_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_vp;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
 }
 
 
 /*
  * Vnode extattr transaction commit/abort
  */
 static int
 ffs_closeextattr(struct vop_closeextattr_args *ap)
 /*
 struct vop_closeextattr_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_vp;
 	int a_commit;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
 		return (EROFS);
 
 	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
 }
 
 /*
  * Vnode operation to remove a named attribute.
  */
 static int
 ffs_deleteextattr(struct vop_deleteextattr_args *ap)
 /*
 vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct inode *ip;
 	struct extattr *eap;
 	uint32_t ul;
 	int olen, error, i, easize;
 	u_char *eae;
 	void *tmp;
 
 	ip = VTOI(ap->a_vp);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (strlen(ap->a_name) == 0)
 		return (EINVAL);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error) {
 
 		/*
 		 * ffs_lock_ea is not needed there, because the vnode
 		 * must be exclusively locked.
 		 */
 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
 			ip->i_ea_error = error;
 		return (error);
 	}
 
 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 	if (error)
 		return (error);
 
 	/* CEM: delete could be done in-place instead */
 	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
 	easize = ip->i_ea_len;
 
 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
 	    &eap, NULL);
 	if (olen == -1) {
 		/* delete but nonexistent */
 		free(eae, M_TEMP);
 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 		return (ENOATTR);
 	}
 	ul = eap->ea_length;
 	i = (u_char *)EXTATTR_NEXT(eap) - eae;
 	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
 	easize -= ul;
 
 	tmp = ip->i_ea_area;
 	ip->i_ea_area = eae;
 	ip->i_ea_len = easize;
 	free(tmp, M_TEMP);
 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  */
 static int
 ffs_getextattr(struct vop_getextattr_args *ap)
 /*
 vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct inode *ip;
 	u_char *eae, *p;
 	unsigned easize;
 	int error, ealen;
 
 	ip = VTOI(ap->a_vp);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 	if (error)
 		return (error);
 
 	eae = ip->i_ea_area;
 	easize = ip->i_ea_len;
 
 	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
 	    NULL, &p);
 	if (ealen >= 0) {
 		error = 0;
 		if (ap->a_size != NULL)
 			*ap->a_size = ealen;
 		else if (ap->a_uio != NULL)
 			error = uiomove(p, ealen, ap->a_uio);
 	} else
 		error = ENOATTR;
 
 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  */
 static int
 ffs_listextattr(struct vop_listextattr_args *ap)
 /*
 vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct inode *ip;
 	struct extattr *eap, *eaend;
 	int error, ealen;
 
 	ip = VTOI(ap->a_vp);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 	if (error)
 		return (error);
 
 	error = 0;
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
 	eap = (struct extattr *)ip->i_ea_area;
 	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
 	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
 		/* make sure this entry is complete */
 		if (EXTATTR_NEXT(eap) > eaend)
 			break;
 		if (eap->ea_namespace != ap->a_attrnamespace)
 			continue;
 
 		ealen = eap->ea_namelength;
 		if (ap->a_size != NULL)
 			*ap->a_size += ealen + 1;
 		else if (ap->a_uio != NULL)
 			error = uiomove(&eap->ea_namelength, ealen + 1,
 			    ap->a_uio);
 	}
 
 	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 ffs_setextattr(struct vop_setextattr_args *ap)
 /*
 vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 */
 {
 	struct inode *ip;
 	struct fs *fs;
 	struct extattr *eap;
 	uint32_t ealength, ul;
 	ssize_t ealen;
 	int olen, eapad1, eapad2, error, i, easize;
 	u_char *eae;
 	void *tmp;
 
 	ip = VTOI(ap->a_vp);
 	fs = ITOFS(ip);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	if (strlen(ap->a_name) == 0)
 		return (EINVAL);
 
 	/* XXX Now unsupported API to delete EAs using NULL uio. */
 	if (ap->a_uio == NULL)
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (EROFS);
 
 	ealen = ap->a_uio->uio_resid;
 	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
 		return (EINVAL);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error) {
 
 		/*
 		 * ffs_lock_ea is not needed there, because the vnode
 		 * must be exclusively locked.
 		 */
 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
 			ip->i_ea_error = error;
 		return (error);
 	}
 
 	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 	if (error)
 		return (error);
 
 	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
 	eapad1 = roundup2(ealength, 8) - ealength;
 	eapad2 = roundup2(ealen, 8) - ealen;
 	ealength += eapad1 + ealen + eapad2;
 
 	/*
 	 * CEM: rewrites of the same size or smaller could be done in-place
 	 * instead.  (We don't acquire any fine-grained locks in here either,
 	 * so we could also do bigger writes in-place.)
 	 */
 	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
 	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
 	easize = ip->i_ea_len;
 
 	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
 	    &eap, NULL);
         if (olen == -1) {
 		/* new, append at end */
 		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
 		    ("unaligned"));
 		eap = (struct extattr *)(eae + easize);
 		easize += ealength;
 	} else {
 		ul = eap->ea_length;
 		i = (u_char *)EXTATTR_NEXT(eap) - eae;
 		if (ul != ealength) {
 			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
 			    easize - i);
 			easize += (ealength - ul);
 		}
 	}
 	if (easize > lblktosize(fs, UFS_NXADDR)) {
 		free(eae, M_TEMP);
 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
 			ip->i_ea_error = ENOSPC;
 		return (ENOSPC);
 	}
 	eap->ea_length = ealength;
 	eap->ea_namespace = ap->a_attrnamespace;
 	eap->ea_contentpadlen = eapad2;
 	eap->ea_namelength = strlen(ap->a_name);
 	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
 	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
 	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
 	if (error) {
 		free(eae, M_TEMP);
 		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
 			ip->i_ea_error = error;
 		return (error);
 	}
 	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);
 
 	tmp = ip->i_ea_area;
 	ip->i_ea_area = eae;
 	ip->i_ea_len = easize;
 	free(tmp, M_TEMP);
 	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
 	return (error);
 }
 
 /*
  * Vnode pointer to File handle
  */
 static int
 ffs_vptofh(struct vop_vptofh_args *ap)
 /*
 vop_vptofh {
 	IN struct vnode *a_vp;
 	IN struct fid *a_fhp;
 };
 */
 {
 	struct inode *ip;
 	struct ufid *ufhp;
 
 	ip = VTOI(ap->a_vp);
 	ufhp = (struct ufid *)ap->a_fhp;
 	ufhp->ufid_len = sizeof(struct ufid);
 	ufhp->ufid_ino = ip->i_number;
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
 
 SYSCTL_DECL(_vfs_ffs);
 static int use_buf_pager = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
     "Always use buffer pager instead of bmap");
 
 static daddr_t
 ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 
 	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
 }
 
 static int
 ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 
 	return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn));
 }
 
 static int
 ffs_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp;
 	struct ufsmount *um;
 
 	vp = ap->a_vp;
 	um = VFSTOUFS(vp->v_mount);
 
 	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
 	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
 }
 
 static int
 ffs_getpages_async(struct vop_getpages_async_args *ap)
 {
 	struct vnode *vp;
 	struct ufsmount *um;
 	int error;
 
 	vp = ap->a_vp;
 	um = VFSTOUFS(vp->v_mount);
 
 	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
 		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
 
 	error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
 	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
 	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
 
 	return (error);
 }
 
Index: head/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- head/sys/ufs/ufs/ufs_vnops.c	(revision 355789)
+++ head/sys/ufs/ufs/ufs_vnops.c	(revision 355790)
@@ -1,2803 +1,2805 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.27 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 #include "opt_suiddir.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/refcount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/conf.h>
 #include <sys/acl.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <sys/file.h>		/* XXX */
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <ufs/ufs/acl.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
 #ifdef UFS_GJOURNAL
 #include <ufs/ufs/gjournal.h>
 FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS");
 #endif
 
 #ifdef QUOTA
 FEATURE(ufs_quota, "UFS disk quotas support");
 FEATURE(ufs_quota64, "64bit UFS disk quotas support");
 #endif
 
 #ifdef SUIDDIR
 FEATURE(suiddir,
     "Give all new files in directory the same ownership as the directory");
 #endif
 
 
 #include <ufs/ffs/ffs_extern.h>
 
 static vop_accessx_t	ufs_accessx;
 static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *);
 static vop_close_t	ufs_close;
 static vop_create_t	ufs_create;
 static vop_getattr_t	ufs_getattr;
 static vop_ioctl_t	ufs_ioctl;
 static vop_link_t	ufs_link;
 static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *);
 static vop_markatime_t	ufs_markatime;
 static vop_mkdir_t	ufs_mkdir;
 static vop_mknod_t	ufs_mknod;
 static vop_open_t	ufs_open;
 static vop_pathconf_t	ufs_pathconf;
 static vop_print_t	ufs_print;
 static vop_readlink_t	ufs_readlink;
 static vop_remove_t	ufs_remove;
 static vop_rename_t	ufs_rename;
 static vop_rmdir_t	ufs_rmdir;
 static vop_setattr_t	ufs_setattr;
 static vop_strategy_t	ufs_strategy;
 static vop_symlink_t	ufs_symlink;
 static vop_whiteout_t	ufs_whiteout;
 static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 
 SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
 
 /*
  * A virgin directory (no blushing please).
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, DT_DIR, 1, ".",
 	0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
 };
 static struct odirtemplate omastertemplate = {
 	0, 12, 1, ".",
 	0, DIRBLKSIZ - 12, 2, ".."
 };
 
 static void
 ufs_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	if (UFS_RDONLY(ip))
 		goto out;
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
 	else if (((vp->v_mount->mnt_kern_flag &
 		    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
 		    (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
 	else if (ip->i_flag & IN_ACCESS)
 		ip->i_flag |= IN_LAZYACCESS;
 	vfs_timestamp(&ts);
 	if (ip->i_flag & IN_ACCESS) {
 		DIP_SET(ip, i_atime, ts.tv_sec);
 		DIP_SET(ip, i_atimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_UPDATE) {
 		DIP_SET(ip, i_mtime, ts.tv_sec);
 		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_CHANGE) {
 		DIP_SET(ip, i_ctime, ts.tv_sec);
 		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 		DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1);
 	}
 
  out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 void
 ufs_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ufs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	error =
 	    ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create");
 	if (error != 0)
 		return (error);
 	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
 	return (0);
 }
 
 /*
  * Mknod vnode call
  */
 /* ARGSUSED */
 static int
 ufs_mknod(ap)
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp = ap->a_vpp;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod");
 	if (error)
 		return (error);
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	if (vap->va_rdev != VNOVAL) {
 		/*
 		 * Want to be able to use this to make badblock
 		 * inodes, so don't truncate the dev number.
 		 */
 		DIP_SET(ip, i_rdev, vap->va_rdev);
 	}
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.  XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		*vpp = NULL;
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Open called.
  */
 /* ARGSUSED */
 static int
 ufs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip;
 
 	if (vp->v_type == VCHR || vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	ip = VTOI(vp);
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((ip->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Update the times on the inode.
  */
 /* ARGSUSED */
 static int
 ufs_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int usecount;
 
 	VI_LOCK(vp);
 	usecount = vp->v_usecount;
 	if (usecount > 1)
 		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 static int
 ufs_accessx(ap)
 	struct vop_accessx_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 	int error;
 #ifdef UFS_ACL
 	struct acl *acl;
 	acl_type_t type;
 #endif
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (accmode & VMODIFY_PERMS) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 #ifdef QUOTA
 			/*
 			 * Inode is accounted in the quotas only if struct
 			 * dquot is attached to it. VOP_ACCESS() is called
 			 * from vn_open_cred() and provides a convenient
 			 * point to call getinoquota().  The lock mode is
 			 * exclusive when the file is opening for write.
 			 */
 			if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
 				error = getinoquota(ip);
 				if (error != 0)
 					return (error);
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * If immutable bit set, nobody gets to write it.  "& ~VADMIN_PERMS"
 	 * permits the owner of the file to remove the IMMUTABLE flag.
 	 */
 	if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) &&
 	    (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) {
 		if (vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			type = ACL_TYPE_NFS4;
 		else
 			type = ACL_TYPE_ACCESS;
 
 		acl = acl_alloc(M_WAITOK);
 		if (type == ACL_TYPE_NFS4)
 			error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td);
 		else
 			error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td);
 		switch (error) {
 		case 0:
 			if (type == ACL_TYPE_NFS4) {
 				error = vaccess_acl_nfs4(vp->v_type, ip->i_uid,
 				    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			} else {
 				error = vfs_unixify_accmode(&accmode);
 				if (error == 0)
 					error = vaccess_acl_posix1e(vp->v_type, ip->i_uid,
 					    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			}
 			break;
 		default:
 			if (error != EOPNOTSUPP)
 				printf(
 "ufs_accessx(): Error retrieving ACL on object (%d).\n",
 				    error);
 			/*
 			 * XXX: Fall back until debugged.  Should
 			 * eventually possibly log an error, and return
 			 * EPERM for safety.
 			 */
 			error = vfs_unixify_accmode(&accmode);
 			if (error == 0)
 				error = vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 				    ip->i_gid, accmode, ap->a_cred, NULL);
 		}
 		acl_free(acl);
 
 		return (error);
 	}
 #endif /* !UFS_ACL */
 	error = vfs_unixify_accmode(&accmode);
 	if (error == 0)
 		error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 		    accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 ufs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	if (I_IS_UFS1(ip)) {
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 	} else {
 		vap->va_atime.tv_sec = ip->i_din2->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
 	}
 	VI_UNLOCK(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	if (I_IS_UFS1(ip)) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks);
 		vap->va_filerev = ip->i_din1->di_modrev;
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec;
 		vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks);
 		vap->va_filerev = ip->i_din2->di_modrev;
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_type = IFTOVT(ip->i_mode);
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  */
 static int
 ufs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE |
 		    SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE |
 		    UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK |
 		    UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
 		    UF_SPARSE | UF_SYSTEM)) != 0)
 			return (EOPNOTSUPP);
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			/* The snapshot flag cannot be toggled. */
 			if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT)
 				return (EPERM);
 		} else {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
 				return (EPERM);
 		}
 		ip->i_flags = vap->va_flags;
 		DIP_SET(ip, i_flags, vap->va_flags);
 		ip->i_flag |= IN_CHANGE;
 		error = UFS_UPDATE(vp, 0);
 		if (ip->i_flags & (IMMUTABLE | APPEND))
 			return (error);
 	}
 	/*
 	 * If immutable or append, no one can change any of its attributes
 	 * except the ones already handled (in some cases, file flags
 	 * including the immutability flags themselves for the superuser).
 	 */
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * XXX most of the following special cases should be in
 		 * callers instead of in N filesystems.  The VDIR check
 		 * mostly already is.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			/*
 			 * Truncation should have an effect in these cases.
 			 * Disallow it if the filesystem is read-only or
 			 * the file is being snapshotted.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((ip->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL |
 		    ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0),
 		    cred)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0)
 			return (EPERM);
 		error = vn_utimes_perm(vp, vap, cred, td);
 		if (error != 0)
 			return (error);
 		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_ACCESS;
 			DIP_SET(ip, i_atime, vap->va_atime.tv_sec);
 			DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_UPDATE;
 			DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec);
 			DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec);
 		}
 		if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) {
 			ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = UFS_UPDATE(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode &
 		   (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH)))
 			return (EPERM);
 		error = ufs_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 #ifdef UFS_ACL
 static int
 ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode,
     int file_owner_id, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct acl *aclp;
 
 	aclp = acl_alloc(M_WAITOK);
 	error = ufs_getacl_nfs4_internal(vp, aclp, td);
 	/*
 	 * We don't have to handle EOPNOTSUPP here, as the filesystem claims
 	 * it supports ACLs.
 	 */
 	if (error)
 		goto out;
 
 	acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id);
 	error = ufs_setacl_nfs4_internal(vp, aclp, td);
 
 out:
 	acl_free(aclp);
 	return (error);
 }
 #endif /* UFS_ACL */
 
 /*
  * Mark this file's access time for update for vfs_mark_atime().  This
  * is called from execve() and mmap().
  */
 static int
 ufs_markatime(ap)
 	struct vop_markatime_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	VI_LOCK(vp);
 	ip->i_flag |= IN_ACCESS;
 	VI_UNLOCK(vp);
 	/*
 	 * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there.
 	 */
 	return (0);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ufs_chmod(vp, mode, cred, td)
 	struct vnode *vp;
 	int mode;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.  Both of these are allowed in
 	 * jail(8).
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Deny setting setuid if we are not the file owner.
 	 */
 	if ((mode & ISUID) && ip->i_uid != cred->cr_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_ADMIN);
 		if (error)
 			return (error);
 	}
 
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	DIP_SET(ip, i_mode, ip->i_mode);
 	ip->i_flag |= IN_CHANGE;
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0)
 		error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td);
 #endif
 	if (error == 0 && (ip->i_flag & IN_CHANGE) != 0)
 		error = UFS_UPDATE(vp, 0);
 
 	return (error);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  */
 static int
 ufs_chown(vp, uid, gid, cred, td)
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 #ifdef QUOTA
 	int i;
 	ufs2_daddr_t change;
 #endif
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, must possess VADMIN for that
 	 * file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if (((uid != ip->i_uid && uid != cred->cr_uid) || 
 	    (gid != ip->i_gid && !groupmember(gid, cred))) &&
 	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN)))
 		return (error);
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) != 0)
 		return (error);
 	if (ouid == uid) {
 		dqrele(vp, ip->i_dquot[USRQUOTA]);
 		ip->i_dquot[USRQUOTA] = NODQUOT;
 	}
 	if (ogid == gid) {
 		dqrele(vp, ip->i_dquot[GRPQUOTA]);
 		ip->i_dquot[GRPQUOTA] = NODQUOT;
 	}
 	change = DIP(ip, i_blocks);
 	(void) chkdq(ip, -change, cred, CHOWN|FORCE);
 	(void) chkiq(ip, -1, cred, CHOWN|FORCE);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		dqrele(vp, ip->i_dquot[i]);
 		ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 	ip->i_gid = gid;
 	DIP_SET(ip, i_gid, gid);
 	ip->i_uid = uid;
 	DIP_SET(ip, i_uid, uid);
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		if ((error = chkdq(ip, change, cred, CHOWN)) == 0) {
 			if ((error = chkiq(ip, 1, cred, CHOWN)) == 0)
 				goto good;
 			else
 				(void) chkdq(ip, -change, cred, CHOWN|FORCE);
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dqrele(vp, ip->i_dquot[i]);
 			ip->i_dquot[i] = NODQUOT;
 		}
 	}
 	ip->i_gid = ogid;
 	DIP_SET(ip, i_gid, ogid);
 	ip->i_uid = ouid;
 	DIP_SET(ip, i_uid, ouid);
 	if (getinoquota(ip) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		(void) chkdq(ip, change, cred, FORCE|CHOWN);
 		(void) chkiq(ip, 1, cred, FORCE|CHOWN);
 		(void) getinoquota(ip);
 	}
 	return (error);
 good:
 	if (getinoquota(ip))
 		panic("ufs_chown: lost quota");
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			DIP_SET(ip, i_mode, ip->i_mode);
 		}
 	}
 	error = UFS_UPDATE(vp, 0);
 	return (error);
 }
 
 static int
 ufs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct inode *ip;
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	int error;
 	struct thread *td;
 
 	td = curthread;
 	ip = VTOI(vp);
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(dvp)->i_flags & APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	if (ip->i_nlink <= 0)
 		vp->v_vflag |= VV_NOSYNC;
 	if ((ip->i_flags & SF_SNAPSHOT) != 0) {
 		/*
 		 * Avoid deadlock where another thread is trying to
 		 * update the inodeblock for dvp and is waiting on
 		 * snaplk.  Temporary unlock the vnode lock for the
 		 * unlinked file and sync the directory.  This should
 		 * allow vput() of the directory to not block later on
 		 * while holding the snapshot vnode locked, assuming
 		 * that the directory hasn't been unlinked too.
 		 */
 		VOP_UNLOCK(vp, 0);
 		(void) VOP_FSYNC(dvp, MNT_WAIT, td);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 out:
 	return (error);
 }
 
 static void
 print_bad_link_count(const char *funcname, struct vnode *dvp)
 {
 	struct inode *dip;
 
 	dip = VTOI(dvp);
 	uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n",
 	    funcname, dip->i_effnlink, (intmax_t)dip->i_number,
 	    dvp->v_mount->mnt_stat.f_mntonname);
 }
 
 /*
  * link vnode call
  */
 static int
 ufs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip;
 	struct direct newdir;
 	int error;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_link: no name");
 #endif
 	if (VTOI(tdvp)->i_effnlink < 2) {
 		print_bad_link_count("ufs_link", tdvp);
 		error = EINVAL;
 		goto out;
 	}
 	ip = VTOI(vp);
 	if (ip->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	/*
 	 * The file may have been removed after namei droped the original
 	 * lock.
 	 */
 	if (ip->i_effnlink == 0) {
 		error = ENOENT;
 		goto out;
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND)) {
 		error = EPERM;
 		goto out;
 	}
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_link(VTOI(tdvp), ip);
 	error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp));
 	if (!error) {
 		ufs_makedirentry(ip, cnp, &newdir);
 		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
 	}
 
 	if (error) {
 		ip->i_effnlink--;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_link(VTOI(tdvp), ip);
 	}
 out:
 	return (error);
 }
 
 /*
  * whiteout vnode call
  */
 static int
 ufs_whiteout(ap)
 	struct vop_whiteout_args /* {
 		struct vnode *a_dvp;
 		struct componentname *a_cnp;
 		int a_flags;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct direct newdir;
 	int error = 0;
 
 	switch (ap->a_flags) {
 	case LOOKUP:
 		/* 4.4 format directories support whiteout operations */
 		if (dvp->v_mount->mnt_maxsymlinklen > 0)
 			return (0);
 		return (EOPNOTSUPP);
 
 	case CREATE:
 		/* create a new directory whiteout */
 #ifdef INVARIANTS
 		if ((cnp->cn_flags & SAVENAME) == 0)
 			panic("ufs_whiteout: missing name");
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		newdir.d_ino = UFS_WINO;
 		newdir.d_namlen = cnp->cn_namelen;
 		bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1);
 		newdir.d_type = DT_WHT;
 		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
 		break;
 
 	case DELETE:
 		/* remove an existing directory whiteout */
 #ifdef INVARIANTS
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		cnp->cn_flags &= ~DOWHITEOUT;
 		error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
 		break;
 	default:
 		panic("ufs_whiteout: unknown op");
 	}
 	return (error);
 }
 
 static volatile int rename_restarts;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
     __DEVOLATILE(int *, &rename_restarts), 0,
     "Times rename had to restart due to lock contention");
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  */
 static int
 ufs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *nvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct inode *fip, *tip, *tdp, *fdp;
 	struct direct newdir;
 	off_t endoff;
 	int doingdirectory, newparent;
 	int error = 0;
 	struct mount *mp;
 	ino_t ino;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
 	endoff = 0;
 	mp = tdvp->v_mount;
 	VOP_UNLOCK(tdvp, 0);
 	if (tvp && tvp != tdvp)
 		VOP_UNLOCK(tvp, 0);
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		mp = NULL;
 		goto releout;
 	}
 relock:
 	/* 
 	 * We need to acquire 2 to 4 locks depending on whether tvp is NULL
 	 * and fdvp and tdvp are the same directory.  Subsequently we need
 	 * to double-check all paths and in the directory rename case we
 	 * need to verify that we are not creating a directory loop.  To
 	 * handle this we acquire all but fdvp using non-blocking
 	 * acquisitions.  If we fail to acquire any lock in the path we will
 	 * drop all held locks, acquire the new lock in a blocking fashion,
 	 * and then release it and restart the rename.  This acquire/release
 	 * step ensures that we do not spin on a lock waiting for release.
 	 */
 	error = vn_lock(fdvp, LK_EXCLUSIVE);
 	if (error)
 		goto releout;
 	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(fdvp, 0);
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto releout;
 		VOP_UNLOCK(tdvp, 0);
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	/*
 	 * Re-resolve fvp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		goto releout;
 	}
 	error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (error != EBUSY)
 			goto releout;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 		if (error != 0)
 			goto releout;
 		VOP_UNLOCK(nvp, 0);
 		vrele(fvp);
 		fvp = nvp;
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	vrele(fvp);
 	fvp = nvp;
 	/*
 	 * Re-resolve tvp and acquire the vnode lock if present.
 	 */
 	error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
 	if (error != 0 && error != EJUSTRETURN) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		VOP_UNLOCK(fvp, 0);
 		goto releout;
 	}
 	/*
 	 * If tvp disappeared we just carry on.
 	 */
 	if (error == EJUSTRETURN && tvp != NULL) {
 		vrele(tvp);
 		tvp = NULL;
 	}
 	/*
 	 * Get the tvp ino if the lookup succeeded.  We may have to restart
 	 * if the non-blocking acquire fails.
 	 */
 	if (error == 0) {
 		nvp = NULL;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 		if (tvp)
 			vrele(tvp);
 		tvp = nvp;
 		if (error) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			if (error != EBUSY)
 				goto releout;
 			error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 			if (error != 0)
 				goto releout;
 			vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 	}
 	fdp = VTOI(fdvp);
 	fip = VTOI(fvp);
 	tdp = VTOI(tdvp);
 	tip = NULL;
 	if (tvp)
 		tip = VTOI(tvp);
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto unlockout;
 	}
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  However, things could change after
 	 * we drop the locks above.
 	 */
 	if (fvp == tvp) {
 		error = 0;
 		goto unlockout;
 	}
 	doingdirectory = 0;
 	newparent = 0;
 	ino = fip->i_number;
 	if (fip->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto unlockout;
 	}
 	if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (fdp->i_flags & APPEND)) {
 		error = EPERM;
 		goto unlockout;
 	}
 	if ((fip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    fdp == fip ||
 		    (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
 			goto unlockout;
 		}
 		if (fdp->i_number != tdp->i_number)
 			newparent = tdp->i_number;
 		doingdirectory = 1;
 	}
 	if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) ||
 	    (tvp != NULL && tvp->v_type == VDIR &&
 	    tvp->v_mountedhere != NULL)) {
 		error = EXDEV;
 		goto unlockout;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..".
 	 */
 	if (doingdirectory && newparent) {
 		error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 		if (error)
 			goto unlockout;
 		error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
 		    &ino);
 		/*
 		 * We encountered a lock that we have to wait for.  Unlock
 		 * everything else and VGET before restarting.
 		 */
 		if (ino) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			if (tvp)
 				VOP_UNLOCK(tvp, 0);
 			error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
 			if (error == 0)
 				vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 		if (error)
 			goto unlockout;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
 	}
 	if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
 	    tdp->i_effnlink == 0)
 		panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	fip->i_effnlink++;
 	fip->i_nlink++;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_setup_link(tdp, fip);
 	error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp));
 	if (error)
 		goto bad;
 
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (tip == NULL) {
 		if (ITODEV(tdp) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		if (doingdirectory && newparent) {
 			/*
 			 * Account for ".." in new directory.
 			 * When source and destination have the same
 			 * parent we don't adjust the link count.  The
 			 * actual link modification is completed when
 			 * .. is rewritten below.
 			 */
 			if (tdp->i_nlink >= UFS_LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
 		}
 		ufs_makedirentry(fip, tcnp, &newdir);
 		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
 		if (error)
 			goto bad;
 		/* Setup tdvp for directory compaction if needed. */
 		if (tdp->i_count && tdp->i_endoff &&
 		    tdp->i_endoff < tdp->i_size)
 			endoff = tdp->i_endoff;
 	} else {
 		if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (tip->i_number == fip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
 		 * must possess VADMIN for the parent directory, or the
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
 		if ((tdp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((tip->i_mode & IFMT) == IFDIR) {
 			if ((tip->i_effnlink > 2) ||
 			    !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (doingdirectory) {
 			if (!newparent) {
 				tdp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
 					softdep_change_linkcnt(tdp);
 			}
 			tip->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
 				softdep_change_linkcnt(tip);
 		}
 		error = ufs_dirrewrite(tdp, tip, fip->i_number,
 		    IFTODT(fip->i_mode),
 		    (doingdirectory && newparent) ? newparent : doingdirectory);
 		if (error) {
 			if (doingdirectory) {
 				if (!newparent) {
 					tdp->i_effnlink++;
 					if (DOINGSOFTDEP(tdvp))
 						softdep_change_linkcnt(tdp);
 				}
 				tip->i_effnlink++;
 				if (DOINGSOFTDEP(tvp))
 					softdep_change_linkcnt(tip);
 			}
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
 			 * The only stuff left in the directory is "."
 			 * and "..". The "." reference is inconsequential
 			 * since we are quashing it. We have removed the "."
 			 * reference and the reference in the parent directory,
 			 * but there may be other hard links. The soft
 			 * dependency code will arrange to do these operations
 			 * after the parent directory entry has been deleted on
 			 * disk, so when running with that code we avoid doing
 			 * them now.
 			 */
 			if (!newparent) {
 				tdp->i_nlink--;
 				DIP_SET(tdp, i_nlink, tdp->i_nlink);
 				tdp->i_flag |= IN_CHANGE;
 			}
 			tip->i_nlink--;
 			DIP_SET(tip, i_nlink, tip->i_nlink);
 			tip->i_flag |= IN_CHANGE;
 		}
 	}
 
 	/*
 	 * 3) Unlink the source.  We have to resolve the path again to
 	 * fixup the directory offset and count for ufs_dirremove.
 	 */
 	if (fdvp == tdvp) {
 		error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 		if (error)
 			panic("ufs_rename: from entry went away!");
 		if (ino != fip->i_number)
 			panic("ufs_rename: ino mismatch %ju != %ju\n",
 			    (uintmax_t)ino, (uintmax_t)fip->i_number);
 	}
 	/*
 	 * If the source is a directory with a
 	 * new parent, the link count of the old
 	 * parent directory must be decremented
 	 * and ".." set to point to the new parent.
 	 */
 	if (doingdirectory && newparent) {
 		/*
 		 * If tip exists we simply use its link, otherwise we must
 		 * add a new one.
 		 */
 		if (tip == NULL) {
 			tdp->i_effnlink++;
 			tdp->i_nlink++;
 			DIP_SET(tdp, i_nlink, tdp->i_nlink);
 			tdp->i_flag |= IN_CHANGE;
 			if (DOINGSOFTDEP(tdvp))
 				softdep_setup_dotdot_link(tdp, fip);
 			error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) &&
 			    !DOINGASYNC(tdvp));
 			/* Don't go to bad here as the new link exists. */
 			if (error)
 				goto unlockout;
 		} else if (DOINGSUJ(tdvp))
 			/* Journal must account for each new link. */
 			softdep_setup_dotdot_link(tdp, fip);
 		fip->i_offset = mastertemplate.dot_reclen;
 		ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
 		cache_purge(fdvp);
 	}
 	error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
 	/*
 	 * The kern_renameat() looks up the fvp using the DELETE flag, which
 	 * causes the removal of the name cache entry for fvp.
 	 * As the relookup of the fvp is done in two steps:
 	 * ufs_lookup_ino() and then VFS_VGET(), another thread might do a
 	 * normal lookup of the from name just before the VFS_VGET() call,
 	 * causing the cache entry to be re-instantiated.
 	 *
 	 * The same issue also applies to tvp if it exists as
 	 * otherwise we may have a stale name cache entry for the new
 	 * name that references the old i-node if it has other links
 	 * or open file descriptors.
 	 */
 	cache_purge(fvp);
 	if (tvp)
 		cache_purge(tvp);
 	cache_purge_negative(tdvp);
 
 unlockout:
 	vput(fdvp);
 	vput(fvp);
 	if (tvp)
 		vput(tvp);
 	/*
 	 * If compaction or fsync was requested do it now that other locks
 	 * are no longer needed.
 	 */
 	if (error == 0 && endoff != 0) {
 		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
 		    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
 		if (error != 0)
 			vn_printf(tdvp,
 			    "ufs_rename: failed to truncate, error %d\n",
 			    error);
 #ifdef UFS_DIRHASH
 		else if (tdp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(tdp, endoff);
 #endif
 		/*
 		 * Even if the directory compaction failed, rename was
 		 * succesful.  Do not propagate a UFS_TRUNCATE() error
 		 * to the caller.
 		 */
 		error = 0;
 	}
 	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
 		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
 	vput(tdvp);
 	return (error);
 
 bad:
 	fip->i_effnlink--;
 	fip->i_nlink--;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_revert_link(tdp, fip);
 	goto unlockout;
 
 releout:
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp)
 		vrele(tvp);
 
 	return (error);
 }
 
 #ifdef UFS_ACL
 static int
 ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
     mode_t dmode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct inode *ip = VTOI(tvp);
 	struct acl *dacl, *acl;
 
 	acl = acl_alloc(M_WAITOK);
 	dacl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL from parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	switch (error) {
 	case 0:
 		/*
 		 * Retrieved a default ACL, so merge mode and ACL if
 		 * necessary.  If the ACL is empty, fall through to
 		 * the "not defined or available" case.
 		 */
 		if (acl->acl_cnt != 0) {
 			dmode = acl_posix1e_newfilemode(dmode, acl);
 			ip->i_mode = dmode;
 			DIP_SET(ip, i_mode, dmode);
 			*dacl = *acl;
 			ufs_sync_acl_from_inode(ip, acl);
 			break;
 		}
 		/* FALLTHROUGH */
 
 	case EOPNOTSUPP:
 		/*
 		 * Just use the mode as-is.
 		 */
 		ip->i_mode = dmode;
 		DIP_SET(ip, i_mode, dmode);
 		error = 0;
 		goto out;
 	
 	default:
 		goto out;
 	}
 
 	/*
 	 * XXX: If we abort now, will Soft Updates notify the extattr
 	 * code that the EAs for the file need to be released?
 	 */
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	if (error == 0)
 		error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td);
 	switch (error) {
 	case 0:
 		break;
 
 	case EOPNOTSUPP:
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above
 		 * was supposed to free acl.
 		 */
 		printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 		/*
 		panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()");
 		 */
 		break;
 
 	default:
 		goto out;
 	}
 
 out:
 	acl_free(acl);
 	acl_free(dacl);
 
 	return (error);
 }
 
 static int
 ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
     mode_t mode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct inode *ip = VTOI(tvp);
 	struct acl *acl;
 
 	acl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL for parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	switch (error) {
 	case 0:
 		/*
 		 * Retrieved a default ACL, so merge mode and ACL if
 		 * necessary.
 		 */
 		if (acl->acl_cnt != 0) {
 			/*
 			 * Two possible ways for default ACL to not
 			 * be present.  First, the EA can be
 			 * undefined, or second, the default ACL can
 			 * be blank.  If it's blank, fall through to
 			 * the it's not defined case.
 			 */
 			mode = acl_posix1e_newfilemode(mode, acl);
 			ip->i_mode = mode;
 			DIP_SET(ip, i_mode, mode);
 			ufs_sync_acl_from_inode(ip, acl);
 			break;
 		}
 		/* FALLTHROUGH */
 
 	case EOPNOTSUPP:
 		/*
 		 * Just use the mode as-is.
 		 */
 		ip->i_mode = mode;
 		DIP_SET(ip, i_mode, mode);
 		error = 0;
 		goto out;
 
 	default:
 		goto out;
 	}
 
 	/*
 	 * XXX: If we abort now, will Soft Updates notify the extattr
 	 * code that the EAs for the file need to be released?
 	 */
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	switch (error) {
 	case 0:
 		break;
 
 	case EOPNOTSUPP:
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above was
 		 * supposed to free acl.
 		 */
 		printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()\n");
 		/* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()"); */
 		break;
 
 	default:
 		goto out;
 	}
 
 out:
 	acl_free(acl);
 
 	return (error);
 }
 
 static int
 ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp,
     mode_t child_mode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct acl *parent_aclp, *child_aclp;
 
 	parent_aclp = acl_alloc(M_WAITOK);
 	child_aclp = acl_alloc(M_WAITOK | M_ZERO);
 
 	error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td);
 	if (error)
 		goto out;
 	acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp,
 	    child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR);
 	error = ufs_setacl_nfs4_internal(tvp, child_aclp, td);
 	if (error)
 		goto out;
 out:
 	acl_free(parent_aclp);
 	acl_free(child_aclp);
 
 	return (error);
 }
 #endif
 
 /*
  * Mkdir system call
  */
 static int
 ufs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct buf *bp;
 	struct dirtemplate dirtemplate, *dtp;
 	struct direct newdir;
 	int error, dmode;
 	long blkoff;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	if (dp->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ufs_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	if (dp->i_effnlink < 2) {
 		print_bad_link_count("ufs_mkdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	ip->i_gid = dp->i_gid;
 	DIP_SET(ip, i_gid, dp->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 			DIP_SET(ip, i_uid, dp->i_uid);
 #ifdef QUOTA
 			if (dp->i_uid != cnp->cn_cred->cr_uid) {
 				/*
 				 * Make sure the correct user gets charged
 				 * for the space.
 				 * Make a dummy credential for the victim.
 				 * XXX This seems to never be accessed out of
 				 * our context so a stack variable is ok.
 				 */
 				refcount_init(&ucred.cr_ref, 1);
 				ucred.cr_uid = ip->i_uid;
 				ucred.cr_ngroups = 1;
 				ucred.cr_groups = &ucred_group;
 				ucred.cr_groups[0] = dp->i_gid;
 				ucp = &ucred;
 			}
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 #ifdef QUOTA
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(dp, ip);
 			UFS_VFREE(tvp, ip->i_number, dmode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(dp, ip);
 		UFS_VFREE(tvp, ip->i_number, dmode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	DIP_SET(ip, i_mode, dmode);
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Bump link count in parent directory to reflect work done below.
 	 * Should be done before reference is created so cleanup is
 	 * possible if we crash.
 	 */
 	dp->i_effnlink++;
 	dp->i_nlink++;
 	DIP_SET(dp, i_nlink, dp->i_nlink);
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
 		softdep_setup_mkdir(dp, ip);
 	error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* !UFS_ACL */
 
 	/*
 	 * Initialize directory with "." and ".." from static template.
 	 */
 	if (dvp->v_mount->mnt_maxsymlinklen > 0)
 		dtp = &mastertemplate;
 	else
 		dtp = (struct dirtemplate *)&omastertemplate;
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	vnode_pager_setsize(tvp, DIRBLKSIZ);
 	if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
 	    BA_CLRBUF, &bp)) != 0)
 		goto bad;
 	ip->i_size = DIRBLKSIZ;
 	DIP_SET(ip, i_size, DIRBLKSIZ);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
 	if (DOINGSOFTDEP(tvp)) {
 		/*
 		 * Ensure that the entire newly allocated block is a
 		 * valid directory so that future growth within the
 		 * block does not have to ensure that the block is
 		 * written before the inode.
 		 */
 		blkoff = DIRBLKSIZ;
 		while (blkoff < bp->b_bcount) {
 			((struct direct *)
 			   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
 			blkoff += DIRBLKSIZ;
 		}
 	}
 	if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) &&
 	    !DOINGASYNC(tvp))) != 0) {
 		(void)bwrite(bp);
 		goto bad;
 	}
 	/*
 	 * Directory set up, now install its entry in the parent directory.
 	 *
 	 * If we are not doing soft dependencies, then we must write out the
 	 * buffer containing the new directory body before entering the new 
 	 * name in the parent. If we are doing soft dependencies, then the
 	 * buffer containing the new directory body will be passed to and
 	 * released in the soft dependency code after the code has attached
 	 * an appropriate ordering dependency to the buffer which ensures that
 	 * the buffer is written before the new name is written in the parent.
 	 */
 	if (DOINGASYNC(dvp))
 		bdwrite(bp);
 	else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
 		goto bad;
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
 	
 bad:
 	if (error == 0) {
 		*ap->a_vpp = tvp;
 	} else {
 		dp->i_effnlink--;
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
 		 */
 		ip->i_effnlink = 0;
 		ip->i_nlink = 0;
 		DIP_SET(ip, i_nlink, 0);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_mkdir(dp, ip);
 
 		vput(tvp);
 	}
 out:
 	return (error);
 }
 
 /*
  * Rmdir system call.
  */
 static int
 ufs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Do not remove a directory that is in the process of being renamed.
 	 * Verify the directory is empty (and valid). Rmdir ".." will not be
 	 * valid since ".." will contain a reference to the current directory
 	 * and thus be non-empty. Do not allow the removal of mounted on
 	 * directories (this can happen when an NFS exported filesystem
 	 * tries to remove a locally mounted on directory).
 	 */
 	error = 0;
 	if (dp->i_effnlink <= 2) {
 		if (dp->i_effnlink == 2)
 			print_bad_link_count("ufs_rmdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	if (vp->v_mountedhere != 0) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	dp->i_effnlink--;
 	ip->i_effnlink--;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_rmdir(dp, ip);
 	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error) {
 		dp->i_effnlink++;
 		ip->i_effnlink++;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_rmdir(dp, ip);
 		goto out;
 	}
 	cache_purge(dvp);
 	/*
 	 * The only stuff left in the directory is "." and "..". The "."
 	 * reference is inconsequential since we are quashing it. The soft
 	 * dependency code will arrange to do these operations after
 	 * the parent directory entry has been deleted on disk, so
 	 * when running with that code we avoid doing them now.
 	 */
 	if (!DOINGSOFTDEP(vp)) {
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		error = UFS_UPDATE(dvp, 0);
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 	}
 	cache_purge(vp);
 #ifdef UFS_DIRHASH
 	/* Kill any active hash; i_effnlink == 0, so it will not come back. */
 	if (ip->i_dirhash != NULL)
 		ufsdirhash_free(ip);
 #endif
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  */
 static int
 ufs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		const char *a_target;
 	} */ *ap;
 {
 	struct vnode *vp, **vpp = ap->a_vpp;
 	struct inode *ip;
 	int len, error;
 
 	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp, "ufs_symlink");
 	if (error)
 		return (error);
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, SHORTLINK(ip), len);
 		ip->i_size = len;
 		DIP_SET(ip, i_size, len);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		error = UFS_UPDATE(vp, 0);
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target),
 		    len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
 	if (error)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Vnode op for reading directories.
  */
 int
 ufs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	struct inode *ip;
 	struct direct *dp, *edp;
 	u_long *cookies;
 	struct dirent dstdp;
 	off_t offset, startoffset;
 	size_t readcnt, skipcnt;
 	ssize_t startresid;
 	u_int ncookies;
 	int error;
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	ip = VTOI(vp);
 	if (ip->i_effnlink == 0)
 		return (0);
 	if (ap->a_ncookies != NULL) {
 		if (uio->uio_resid < 0)
 			ncookies = 0;
 		else
 			ncookies = uio->uio_resid;
 		if (uio->uio_offset >= ip->i_size)
 			ncookies = 0;
 		else if (ip->i_size - uio->uio_offset < ncookies)
 			ncookies = ip->i_size - uio->uio_offset;
 		ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 	offset = startoffset = uio->uio_offset;
 	startresid = uio->uio_resid;
 	error = 0;
 	while (error == 0 && uio->uio_resid > 0 &&
 	    uio->uio_offset < ip->i_size) {
 		error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp);
 		if (error)
 			break;
 		if (bp->b_offset + bp->b_bcount > ip->i_size)
 			readcnt = ip->i_size - bp->b_offset;
 		else
 			readcnt = bp->b_bcount;
 		skipcnt = (size_t)(uio->uio_offset - bp->b_offset) &
 		    ~(size_t)(DIRBLKSIZ - 1);
 		offset = bp->b_offset + skipcnt;
 		dp = (struct direct *)&bp->b_data[skipcnt];
 		edp = (struct direct *)&bp->b_data[readcnt];
 		while (error == 0 && uio->uio_resid > 0 && dp < edp) {
 			if (dp->d_reclen <= offsetof(struct direct, d_name) ||
 			    (caddr_t)dp + dp->d_reclen > (caddr_t)edp) {
 				error = EIO;
 				break;
 			}
 #if BYTE_ORDER == LITTLE_ENDIAN
 			/* Old filesystem format. */
 			if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 				dstdp.d_namlen = dp->d_type;
 				dstdp.d_type = dp->d_namlen;
 			} else
 #endif
 			{
 				dstdp.d_namlen = dp->d_namlen;
 				dstdp.d_type = dp->d_type;
 			}
 			if (offsetof(struct direct, d_name) + dstdp.d_namlen >
 			    dp->d_reclen) {
 				error = EIO;
 				break;
 			}
 			if (offset < startoffset || dp->d_ino == 0)
 				goto nextentry;
 			dstdp.d_fileno = dp->d_ino;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 			/* NOTE: d_off is the offset of the *next* entry. */
 			dstdp.d_off = offset + dp->d_reclen;
 			dirent_terminate(&dstdp);
 			if (dstdp.d_reclen > uio->uio_resid) {
 				if (uio->uio_resid == startresid)
 					error = EINVAL;
 				else
 					error = EJUSTRETURN;
 				break;
 			}
 			/* Advance dp. */
 			error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio);
 			if (error)
 				break;
 			if (cookies != NULL) {
 				KASSERT(ncookies > 0,
 				    ("ufs_readdir: cookies buffer too small"));
 				*cookies = offset + dp->d_reclen;
 				cookies++;
 				ncookies--;
 			}
 nextentry:
 			offset += dp->d_reclen;
 			dp = (struct direct *)((caddr_t)dp + dp->d_reclen);
 		}
 		bqrelse(bp);
 		uio->uio_offset = offset;
 	}
 	/* We need to correct uio_offset. */
 	uio->uio_offset = offset;
 	if (error == EJUSTRETURN)
 		error = 0;
 	if (ap->a_ncookies != NULL) {
 		if (error == 0) {
 			ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	if (error == 0 && ap->a_eofflag)
 		*ap->a_eofflag = ip->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link
  */
 static int
 ufs_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	doff_t isize;
 
 	isize = ip->i_size;
 	if ((isize < vp->v_mount->mnt_maxsymlinklen) ||
 	    DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */
 		return (uiomove(SHORTLINK(ip), isize, ap->a_uio));
 	}
 	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ufs_bmaparray() operation may not
  * deadlock on memory.  See ufs_bmap() for details.
  */
 static int
 ufs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	ufs2_daddr_t blkno;
 	int error;
 
 	if (bp->b_blkno == bp->b_lblkno) {
 		error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL);
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp);
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  */
 static int
 ufs_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink,
 	    ip->i_effnlink, (intmax_t)ip->i_size);
 	if (I_IS_UFS2(ip))
 		printf(", extsize %d", ip->i_din2->di_extsize);
 	printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n",
 	    (uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid,
 	    (u_int)ip->i_flags, PRINT_INODE_FLAGS);
 	printf("\tino %lu, on dev %s", (u_long)ip->i_number,
 	    devtoname(ITODEV(ip)));
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  */
 static int
 ufsfifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	int usecount;
 
 	VI_LOCK(vp);
 	usecount = vp->v_usecount;
 	if (usecount > 1)
 		ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Fall through to ufs kqfilter routines if needed 
  */
 static int
 ufsfifo_kqfilter(ap)
 	struct vop_kqfilter_args *ap;
 {
 	int error;
 
 	error = fifo_specops.vop_kqfilter(ap);
 	if (error)
 		error = vfs_kqfilter(ap);
 	return (error);
 }
 
 /*
  * Return POSIX pathconf information applicable to ufs filesystems.
  */
 static int
 ufs_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	int error;
 
 	error = 0;
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = UFS_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = UFS_MAXNAMLEN;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 #ifdef UFS_ACL
 	case _PC_ACL_EXTENDED:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_NFS4:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 #endif
 	case _PC_ACL_PATH_MAX:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 #else
 		*ap->a_retval = 3;
 #endif
 		break;
 #ifdef MAC
 	case _PC_MAC_PRESENT:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 #endif
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Initialize the vnode associated with a new inode, handle aliased
  * vnodes.
  */
 int
 ufs_vinit(mntp, fifoops, vpp)
 	struct mount *mntp;
 	struct vop_vector *fifoops;
 	struct vnode **vpp;
 {
 	struct inode *ip;
 	struct vnode *vp;
 
 	vp = *vpp;
 	ASSERT_VOP_LOCKED(vp, "ufs_vinit");
 	ip = VTOI(vp);
 	vp->v_type = IFTOVT(ip->i_mode);
 	/*
 	 * Only unallocated inodes should be of type VNON.
 	 */
 	if (ip->i_mode != 0 && vp->v_type == VNON)
 		return (EINVAL);
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 	if (ip->i_number == UFS_ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Allocate a new inode.
  * Vnode dvp must be locked.
  */
 static int
 ufs_makeinode(mode, dvp, vpp, cnp, callfunc)
 	int mode;
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 	const char *callfunc;
 {
 	struct inode *ip, *pdir;
 	struct direct newdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", callfunc);
 #endif
 	*vpp = NULL;
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	if (pdir->i_effnlink < 2) {
 		print_bad_link_count(callfunc, dvp);
 		return (EINVAL);
 	}
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)
 		return (error);
 	ip = VTOI(tvp);
 	ip->i_gid = pdir->i_gid;
 	DIP_SET(ip, i_gid, pdir->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 			mode &= ~07111;
 #ifdef QUOTA
 			/*
 			 * Make sure the correct user gets charged
 			 * for the space.
 			 * Quickly knock up a dummy credential for the victim.
 			 * XXX This seems to never be accessed out of our
 			 * context so a stack variable is ok.
 			 */
 			refcount_init(&ucred.cr_ref, 1);
 			ucred.cr_uid = ip->i_uid;
 			ucred.cr_ngroups = 1;
 			ucred.cr_groups = &ucred_group;
 			ucred.cr_groups[0] = pdir->i_gid;
 			ucp = &ucred;
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 
 #ifdef QUOTA
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(pdir, ip);
 			UFS_VFREE(tvp, ip->i_number, mode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(pdir, ip);
 		UFS_VFREE(tvp, ip->i_number, mode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	DIP_SET(ip, i_mode, mode);
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_effnlink = 1;
 	ip->i_nlink = 1;
 	DIP_SET(ip, i_nlink, 1);
 	if (DOINGSOFTDEP(tvp))
 		softdep_setup_create(VTOI(dvp), ip);
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) {
 		ip->i_mode &= ~ISGID;
 		DIP_SET(ip, i_mode, ip->i_mode);
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* !UFS_ACL */
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
 	if (error)
 		goto bad;
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 */
 	ip->i_effnlink = 0;
 	ip->i_nlink = 0;
 	DIP_SET(ip, i_nlink, 0);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(tvp))
 		softdep_revert_create(VTOI(dvp), ip);
 	vput(tvp);
 	return (error);
 }
 
 static int
 ufs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = ap->a_vp;
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
 		error = vn_lock(vp, LK_SHARED);
 		if (error == 0) {
 			error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data);
 			VOP_UNLOCK(vp, 0);
 		} else
 			error = EBADF;
 		return (error);
 	case FIOSEEKHOLE:
 		return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data,
 		    ap->a_cred));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /* Global vfs data structures for ufs. */
 struct vop_vector ufs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 	.vop_accessx =		ufs_accessx,
 	.vop_bmap =		ufs_bmap,
 	.vop_cachedlookup =	ufs_lookup,
 	.vop_close =		ufs_close,
 	.vop_create =		ufs_create,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_ioctl =		ufs_ioctl,
 	.vop_link =		ufs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_markatime =	ufs_markatime,
 	.vop_mkdir =		ufs_mkdir,
 	.vop_mknod =		ufs_mknod,
 	.vop_need_inactive =	ufs_need_inactive,
 	.vop_open =		ufs_open,
 	.vop_pathconf =		ufs_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ufs_print,
 	.vop_readdir =		ufs_readdir,
 	.vop_readlink =		ufs_readlink,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_remove =		ufs_remove,
 	.vop_rename =		ufs_rename,
 	.vop_rmdir =		ufs_rmdir,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_strategy =		ufs_strategy,
 	.vop_symlink =		ufs_symlink,
 	.vop_whiteout =		ufs_whiteout,
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
+VFS_VOP_VECTOR_REGISTER(ufs_vnodeops);
 
 struct vop_vector ufs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_accessx =		ufs_accessx,
 	.vop_close =		ufsfifo_close,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_kqfilter =		ufsfifo_kqfilter,
 	.vop_markatime =	ufs_markatime,
 	.vop_pathconf = 	ufs_pathconf,
 	.vop_print =		ufs_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_write =		VOP_PANIC,
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
+VFS_VOP_VECTOR_REGISTER(ufs_fifoops);