Index: vendor-sys/illumos/dist/uts/common/fs/gfs.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/gfs.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/gfs.c (revision 318933) @@ -1,1178 +1,1179 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* Portions Copyright 2007 Shivakumar GN */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Generic pseudo-filesystem routines. * * There are significant similarities between the implementation of certain file * system entry points across different filesystems. While one could attempt to * "choke up on the bat" and incorporate common functionality into a VOP * preamble or postamble, such an approach is limited in the benefit it can * provide. 
In this file we instead define a toolkit of routines which can be * called from a filesystem (with in-kernel pseudo-filesystems being the focus * of the exercise) in a more component-like fashion. * * There are three basic classes of routines: * * 1) Lowlevel support routines * * These routines are designed to play a support role for existing * pseudo-filesystems (such as procfs). They simplify common tasks, * without forcing the filesystem to hand over management to GFS. The * routines covered are: * * gfs_readdir_init() * gfs_readdir_emit() * gfs_readdir_emitn() * gfs_readdir_pred() * gfs_readdir_fini() * gfs_lookup_dot() * * 2) Complete GFS management * * These routines take a more active role in management of the * pseudo-filesystem. They handle the relationship between vnode private * data and VFS data, as well as the relationship between vnodes in the * directory hierarchy. * * In order to use these interfaces, the first member of every private * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control * to GFS. * * gfs_file_create() * gfs_dir_create() * gfs_root_create() * * gfs_file_inactive() * gfs_dir_inactive() * gfs_dir_lookup() * gfs_dir_readdir() * * gfs_vop_inactive() * gfs_vop_lookup() * gfs_vop_readdir() * gfs_vop_map() * * 3) Single File pseudo-filesystems * * This routine creates a rooted file to be overlayed ontop of another * file in the physical filespace. * * Note that the parent is NULL (actually the vfs), but there is nothing * technically keeping such a file from utilizing the "Complete GFS * management" set of routines. * * gfs_root_create_file() */ /* * gfs_make_opsvec: take an array of vnode type definitions and create * their vnodeops_t structures * * This routine takes an array of gfs_opsvec_t's. It could * alternatively take an array of gfs_opsvec_t*'s, which would allow * vnode types to be completely defined in files external to the caller * of gfs_make_opsvec(). 
As it stands, much more sharing takes place -- * both the caller and the vnode type provider need to access gfsv_ops * and gfsv_template, and the caller also needs to know gfsv_name. */ int gfs_make_opsvec(gfs_opsvec_t *vec) { int error, i; for (i = 0; ; i++) { if (vec[i].gfsv_name == NULL) return (0); error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, vec[i].gfsv_ops); if (error) break; } cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", vec[i].gfsv_name); for (i--; i >= 0; i--) { vn_freevnodeops(*vec[i].gfsv_ops); *vec[i].gfsv_ops = NULL; } return (error); } /* * Low level directory routines * * These routines provide some simple abstractions for reading directories. * They are designed to be used by existing pseudo filesystems (namely procfs) * that already have a complicated management infrastructure. */ /* * gfs_get_parent_ino: used to obtain a parent inode number and the * inode number of the given vnode in preparation for calling gfs_readdir_init. */ int gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, ino64_t *pino, ino64_t *ino) { vnode_t *parent; gfs_dir_t *dp = dvp->v_data; int error; *ino = dp->gfsd_file.gfs_ino; parent = dp->gfsd_file.gfs_parent; if (parent == NULL) { *pino = *ino; /* root of filesystem */ } else if (dvp->v_flag & V_XATTRDIR) { vattr_t va; va.va_mask = AT_NODEID; error = VOP_GETATTR(parent, &va, 0, cr, ct); if (error) return (error); *pino = va.va_nodeid; } else { *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; } return (0); } /* * gfs_readdir_init: initiate a generic readdir * st - a pointer to an uninitialized gfs_readdir_state_t structure * name_max - the directory's maximum file name length * ureclen - the exported file-space record length (1 for non-legacy FSs) * uiop - the uiop passed to readdir * parent - the parent directory's inode * self - this directory's inode * flags - flags from VOP_READDIR * * Returns 0 or a non-zero errno. 
* * Typical VOP_READDIR usage of gfs_readdir_*: * * if ((error = gfs_readdir_init(...)) != 0) * return (error); * eof = 0; * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { * if (!consumer_entry_at(voffset)) * voffset = consumer_next_entry(voffset); * if (consumer_eof(voffset)) { * eof = 1 * break; * } * if ((error = gfs_readdir_emit(..., voffset, * consumer_ino(voffset), consumer_name(voffset))) != 0) * break; * } * return (gfs_readdir_fini(..., error, eofp, eof)); * * As you can see, a zero result from gfs_readdir_pred() or * gfs_readdir_emit() indicates that processing should continue, * whereas a non-zero result indicates that the loop should terminate. * Most consumers need do nothing more than let gfs_readdir_fini() * determine what the cause of failure was and return the appropriate * value. */ int gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, uio_t *uiop, ino64_t parent, ino64_t self, int flags) { size_t dirent_size; if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || (uiop->uio_loffset % ureclen) != 0) return (EINVAL); st->grd_ureclen = ureclen; st->grd_oresid = uiop->uio_resid; st->grd_namlen = name_max; if (flags & V_RDDIR_ENTFLAGS) dirent_size = EDIRENT_RECLEN(st->grd_namlen); else dirent_size = DIRENT64_RECLEN(st->grd_namlen); st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); st->grd_parent = parent; st->grd_self = self; st->grd_flags = flags; return (0); } /* * gfs_readdir_emit_int: internal routine to emit directory entry * * st - the current readdir state, which must have d_ino/ed_ino * and d_name/ed_name set * uiop - caller-supplied uio pointer * next - the offset of the next entry */ static int gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next) { int reclen; dirent64_t *dp; edirent_t *edp; if (st->grd_flags & V_RDDIR_ENTFLAGS) { edp = st->grd_dirent; reclen = EDIRENT_RECLEN(strlen(edp->ed_name)); } else { dp = st->grd_dirent; reclen = DIRENT64_RECLEN(strlen(dp->d_name)); } if (reclen > 
uiop->uio_resid) { /* * Error if no entries were returned yet */ if (uiop->uio_resid == st->grd_oresid) return (EINVAL); return (-1); } if (st->grd_flags & V_RDDIR_ENTFLAGS) { edp->ed_off = next; edp->ed_reclen = (ushort_t)reclen; } else { dp->d_off = next; dp->d_reclen = (ushort_t)reclen; } if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) return (EFAULT); uiop->uio_loffset = next; return (0); } /* * gfs_readdir_emit: emit a directory entry * voff - the virtual offset (obtained from gfs_readdir_pred) * ino - the entry's inode * name - the entry's name * eflags - value for ed_eflags (if processing edirent_t) * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, const char *name, int eflags) { offset_t off = (voff + 2) * st->grd_ureclen; if (st->grd_flags & V_RDDIR_ENTFLAGS) { edirent_t *edp = st->grd_dirent; edp->ed_ino = ino; (void) strncpy(edp->ed_name, name, st->grd_namlen); edp->ed_eflags = eflags; } else { dirent64_t *dp = st->grd_dirent; dp->d_ino = ino; (void) strncpy(dp->d_name, name, st->grd_namlen); } /* * Inter-entry offsets are invalid, so we assume a record size of * grd_ureclen and explicitly set the offset appropriately. */ return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen)); } /* * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer * instead of a string for the entry's name. 
*/ int gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, ino64_t ino, unsigned long num) { char buf[40]; numtos(num, buf); return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); } /* * gfs_readdir_pred: readdir loop predicate * voffp - a pointer in which the next virtual offset should be stored * * Returns a 0 on success, a non-zero errno on failure, or -1 if the * readdir loop should terminate. A non-zero result (either errno or * -1) from this function is typically passed directly to * gfs_readdir_fini(). */ int gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp) { offset_t off, voff; int error; top: if (uiop->uio_resid <= 0) return (-1); off = uiop->uio_loffset / st->grd_ureclen; voff = off - 2; if (off == 0) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, ".", 0)) == 0) goto top; } else if (off == 1) { if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, "..", 0)) == 0) goto top; } else { *voffp = voff; return (0); } return (error); } /* * gfs_readdir_fini: generic readdir cleanup * error - if positive, an error to return * eofp - the eofp passed to readdir * eof - the eof value * * Returns a 0 on success, a non-zero errno on failure. This result * should be returned from readdir. */ int gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) { size_t dirent_size; if (st->grd_flags & V_RDDIR_ENTFLAGS) dirent_size = EDIRENT_RECLEN(st->grd_namlen); else dirent_size = DIRENT64_RECLEN(st->grd_namlen); kmem_free(st->grd_dirent, dirent_size); if (error > 0) return (error); if (eofp) *eofp = eof; return (0); } /* * gfs_lookup_dot * * Performs a basic check for "." and ".." directory entries. 
*/ int gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) { if (*nm == '\0' || strcmp(nm, ".") == 0) { VN_HOLD(dvp); *vpp = dvp; return (0); } else if (strcmp(nm, "..") == 0) { if (pvp == NULL) { ASSERT(dvp->v_flag & VROOT); VN_HOLD(dvp); *vpp = dvp; } else { VN_HOLD(pvp); *vpp = pvp; } return (0); } return (-1); } /* * gfs_file_create(): create a new GFS file * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * * In order to use this interface, the parent vnode must have been created by * gfs_dir_create(), and the private data stored in v_data must have a * 'gfs_file_t' as its first field. * * Given these constraints, this routine will automatically: * * - Allocate v_data for the vnode * - Initialize necessary fields in the vnode * - Hold the parent */ vnode_t * gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops) { gfs_file_t *fp; vnode_t *vp; /* * Allocate vnode and internal data structure */ fp = kmem_zalloc(size, KM_SLEEP); vp = vn_alloc(KM_SLEEP); /* * Set up various pointers */ fp->gfs_vnode = vp; fp->gfs_parent = pvp; vp->v_data = fp; fp->gfs_size = size; fp->gfs_type = GFS_FILE; /* * Initialize vnode and hold parent. */ vn_setops(vp, ops); if (pvp) { VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0); VN_HOLD(pvp); } return (vp); } /* * gfs_dir_create: creates a new directory in the parent * * size - size of private data structure (v_data) * pvp - parent vnode (GFS directory) * ops - vnode operations vector * entries - NULL-terminated list of static entries (if any) * maxlen - maximum length of a directory entry * readdir_cb - readdir callback (see gfs_dir_readdir) * inode_cb - inode callback (see gfs_dir_readdir) * lookup_cb - lookup callback (see gfs_dir_lookup) * * In order to use this function, the first member of the private vnode * structure (v_data) must be a gfs_dir_t. 
For each directory, there are * static entries, defined when the structure is initialized, and dynamic * entries, retrieved through callbacks. * * If a directory has static entries, then it must supply a inode callback, * which will compute the inode number based on the parent and the index. * For a directory with dynamic entries, the caller must supply a readdir * callback and a lookup callback. If a static lookup fails, we fall back to * the supplied lookup callback, if any. * * This function also performs the same initialization as gfs_file_create(). */ vnode_t * gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp; gfs_dir_t *dp; gfs_dirent_t *de; vp = gfs_file_create(struct_size, pvp, ops); vp->v_type = VDIR; dp = vp->v_data; dp->gfsd_file.gfs_type = GFS_DIR; dp->gfsd_maxlen = maxlen; if (entries != NULL) { for (de = entries; de->gfse_name != NULL; de++) dp->gfsd_nstatic++; dp->gfsd_static = kmem_alloc( dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); bcopy(entries, dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } dp->gfsd_readdir = readdir_cb; dp->gfsd_lookup = lookup_cb; dp->gfsd_inode = inode_cb; mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); return (vp); } /* * gfs_root_create(): create a root vnode for a GFS filesystem * * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The * only difference is that it takes a vfs_t instead of a vnode_t as its parent. 
*/ vnode_t * gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) { vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb, maxlen, readdir_cb, lookup_cb); /* Manually set the inode */ ((gfs_file_t *)vp->v_data)->gfs_ino = ino; VFS_HOLD(vfsp); VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0); vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; return (vp); } /* * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem * * Similar to gfs_root_create(), this creates a root vnode for a file to * be the pseudo-filesystem. */ vnode_t * gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) { vnode_t *vp = gfs_file_create(size, NULL, ops); ((gfs_file_t *)vp->v_data)->gfs_ino = ino; VFS_HOLD(vfsp); VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; return (vp); } /* * gfs_file_inactive() * * Called from the VOP_INACTIVE() routine. If necessary, this routine will * remove the given vnode from the parent directory and clean up any references * in the VFS layer. * * If the vnode was not removed (due to a race with vget), then NULL is * returned. Otherwise, a pointer to the private data is returned. */ void * gfs_file_inactive(vnode_t *vp) { int i; gfs_dirent_t *ge = NULL; gfs_file_t *fp = vp->v_data; gfs_dir_t *dp = NULL; void *data; if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) goto found; dp = fp->gfs_parent->v_data; /* * First, see if this vnode is cached in the parent. */ gfs_dir_lock(dp); /* * Find it in the set of static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (ge->gfse_vnode == vp) goto found; } /* * If 'ge' is NULL, then it is a dynamic entry. 
*/ ge = NULL; found: if (vp->v_flag & V_XATTRDIR) { mutex_enter(&fp->gfs_parent->v_lock); } mutex_enter(&vp->v_lock); if (vp->v_count == 1) { /* * Really remove this vnode */ data = vp->v_data; if (ge != NULL) { /* * If this was a statically cached entry, simply set the * cached vnode to NULL. */ ge->gfse_vnode = NULL; } if (vp->v_flag & V_XATTRDIR) { fp->gfs_parent->v_xattrdir = NULL; mutex_exit(&fp->gfs_parent->v_lock); } mutex_exit(&vp->v_lock); /* * Free vnode and release parent */ if (fp->gfs_parent) { if (dp) { gfs_dir_unlock(dp); } VN_RELE(fp->gfs_parent); } else { ASSERT(vp->v_vfsp != NULL); VFS_RELE(vp->v_vfsp); } vn_free(vp); } else { - vp->v_count--; + VN_RELE_LOCKED(vp); data = NULL; mutex_exit(&vp->v_lock); if (vp->v_flag & V_XATTRDIR) { mutex_exit(&fp->gfs_parent->v_lock); } if (dp) gfs_dir_unlock(dp); } return (data); } /* * gfs_dir_inactive() * * Same as above, but for directories. */ void * gfs_dir_inactive(vnode_t *vp) { gfs_dir_t *dp; ASSERT(vp->v_type == VDIR); if ((dp = gfs_file_inactive(vp)) != NULL) { mutex_destroy(&dp->gfsd_lock); if (dp->gfsd_nstatic) kmem_free(dp->gfsd_static, dp->gfsd_nstatic * sizeof (gfs_dirent_t)); } return (dp); } /* * gfs_dir_lookup_dynamic() * * This routine looks up the provided name amongst the dynamic entries * in the gfs directory and returns the corresponding vnode, if found. * * The gfs directory is expected to be locked by the caller prior to * calling this function. The directory will be unlocked during the * execution of this function, but will be locked upon return from the * function. This function returns 0 on success, non-zero on error. * * The dynamic lookups are performed by invoking the lookup * callback, which is passed to this function as the first argument. 
* The arguments to the callback are: * * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, * int flags, int *deflgs, pathname_t *rpnp); * * pvp - parent vnode * nm - name of entry * vpp - pointer to resulting vnode * cr - pointer to cred * flags - flags value from lookup request * ignored here; currently only used to request * insensitive lookups * direntflgs - output parameter, directory entry flags * ignored here; currently only used to indicate a lookup * has more than one possible match when case is not considered * realpnp - output parameter, real pathname * ignored here; when lookup was performed case-insensitively, * this field contains the "real" name of the file. * * Returns 0 on success, non-zero on error. */ static int gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, int *direntflags, pathname_t *realpnp) { gfs_file_t *fp; ino64_t ino; int ret; ASSERT(GFS_DIR_LOCKED(dp)); /* * Drop the directory lock, as the lookup routine * will need to allocate memory, or otherwise deadlock on this * directory. */ gfs_dir_unlock(dp); ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); gfs_dir_lock(dp); /* * The callback for extended attributes returns a vnode * with v_data from an underlying fs. */ if (ret == 0 && !IS_XATTRDIR(dvp)) { fp = (gfs_file_t *)((*vpp)->v_data); fp->gfs_index = -1; fp->gfs_ino = ino; } return (ret); } /* * gfs_dir_lookup_static() * * This routine looks up the provided name amongst the static entries * in the gfs directory and returns the corresponding vnode, if found. * The first argument to the function is a pointer to the comparison * function this function should use to decide if names are a match. * * If a match is found, and GFS_CACHE_VNODE is set and the vnode * exists, we simply return the existing vnode. Otherwise, we call * the static entry's callback routine, caching the result if * necessary. 
If the idx pointer argument is non-NULL, we use it to * return the index of the matching static entry. * * The gfs directory is expected to be locked by the caller prior to calling * this function. The directory may be unlocked during the execution of * this function, but will be locked upon return from the function. * * This function returns 0 if a match is found, ENOENT if not. */ static int gfs_dir_lookup_static(int (*compare)(const char *, const char *), gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, vnode_t **vpp, pathname_t *rpnp) { gfs_dirent_t *ge; vnode_t *vp = NULL; int i; ASSERT(GFS_DIR_LOCKED(dp)); /* * Search static entries. */ for (i = 0; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (compare(ge->gfse_name, nm) == 0) { if (rpnp) (void) strlcpy(rpnp->pn_buf, ge->gfse_name, rpnp->pn_bufsize); if (ge->gfse_vnode) { ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); vp = ge->gfse_vnode; VN_HOLD(vp); break; } /* * We drop the directory lock, as the constructor will * need to do KM_SLEEP allocations. If we return from * the constructor only to find that a parallel * operation has completed, and GFS_CACHE_VNODE is set * for this entry, we discard the result in favor of * the cached vnode. */ gfs_dir_unlock(dp); vp = ge->gfse_ctor(dvp); gfs_dir_lock(dp); ((gfs_file_t *)vp->v_data)->gfs_index = i; /* Set the inode according to the callback. */ ((gfs_file_t *)vp->v_data)->gfs_ino = dp->gfsd_inode(dvp, i); if (ge->gfse_flags & GFS_CACHE_VNODE) { if (ge->gfse_vnode == NULL) { ge->gfse_vnode = vp; } else { /* * A parallel constructor beat us to it; * return existing vnode. We have to be * careful because we can't release the * current vnode while holding the * directory lock; its inactive routine * will try to lock this directory. 
*/ vnode_t *oldvp = vp; vp = ge->gfse_vnode; VN_HOLD(vp); gfs_dir_unlock(dp); VN_RELE(oldvp); gfs_dir_lock(dp); } } break; } } if (vp == NULL) return (ENOENT); else if (idx) *idx = i; *vpp = vp; return (0); } /* * gfs_dir_lookup() * * Looks up the given name in the directory and returns the corresponding * vnode, if found. * * First, we search statically defined entries, if any, with a call to * gfs_dir_lookup_static(). If no static entry is found, and we have * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). * * This function returns 0 on success, non-zero on error. */ int gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, int flags, int *direntflags, pathname_t *realpnp) { gfs_dir_t *dp = dvp->v_data; boolean_t casecheck; vnode_t *dynvp = NULL; vnode_t *vp = NULL; int (*compare)(const char *, const char *); int error, idx; ASSERT(dvp->v_type == VDIR); if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) return (0); casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || (flags & FIGNORECASE)) compare = strcasecmp; else compare = strcmp; gfs_dir_lock(dp); error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); if (vp && casecheck) { gfs_dirent_t *ge; int i; for (i = idx + 1; i < dp->gfsd_nstatic; i++) { ge = &dp->gfsd_static[i]; if (strcasecmp(ge->gfse_name, nm) == 0) { *direntflags |= ED_CASE_CONFLICT; goto out; } } } if ((error || casecheck) && dp->gfsd_lookup) error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, &dynvp, cr, flags, direntflags, vp ? 
NULL : realpnp); if (vp && dynvp) { /* static and dynamic entries are case-insensitive conflict */ ASSERT(casecheck); *direntflags |= ED_CASE_CONFLICT; VN_RELE(dynvp); } else if (vp == NULL) { vp = dynvp; } else if (error == ENOENT) { error = 0; } else if (error) { VN_RELE(vp); vp = NULL; } out: gfs_dir_unlock(dp); *vpp = vp; return (error); } /* * gfs_dir_readdir: does a readdir() on the given directory * * dvp - directory vnode * uiop - uio structure * eofp - eof pointer * data - arbitrary data passed to readdir callback * * This routine does all the readdir() dirty work. Even so, the caller must * supply two callbacks in order to get full compatibility. * * If the directory contains static entries, an inode callback must be * specified. This avoids having to create every vnode and call VOP_GETATTR() * when reading the directory. This function has the following arguments: * * ino_t gfs_inode_cb(vnode_t *vp, int index); * * vp - vnode for the directory * index - index in original gfs_dirent_t array * * Returns the inode number for the given entry. * * For directories with dynamic entries, a readdir callback must be provided. * This is significantly more complex, thanks to the particulars of * VOP_READDIR(). * * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, * offset_t *off, offset_t *nextoff, void *data, int flags) * * vp - directory vnode * dp - directory entry, sized according to maxlen given to * gfs_dir_create(). callback must fill in d_name and * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS * is set in 'flags'. * eofp - callback must set to 1 when EOF has been reached * off - on entry, the last offset read from the directory. Callback * must set to the offset of the current entry, typically left * untouched. * nextoff - callback must set to offset of next entry. 
Typically * (off + 1) * data - caller-supplied data * flags - VOP_READDIR flags * * Return 0 on success, or error on failure. */ int gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr, caller_context_t *ct, int flags) { gfs_readdir_state_t gstate; int error, eof = 0; ino64_t ino, pino; offset_t off, next; gfs_dir_t *dp = dvp->v_data; error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino); if (error) return (error); if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, pino, ino, flags)) != 0) return (error); while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 && !eof) { if (off >= 0 && off < dp->gfsd_nstatic) { ino = dp->gfsd_inode(dvp, off); if ((error = gfs_readdir_emit(&gstate, uiop, off, ino, dp->gfsd_static[off].gfse_name, 0)) != 0) break; } else if (dp->gfsd_readdir) { off -= dp->gfsd_nstatic; if ((error = dp->gfsd_readdir(dvp, gstate.grd_dirent, &eof, &off, &next, data, flags)) != 0 || eof) break; off += dp->gfsd_nstatic + 2; next += dp->gfsd_nstatic + 2; if ((error = gfs_readdir_emit_int(&gstate, uiop, next)) != 0) break; } else { /* * Offset is beyond the end of the static entries, and * we have no dynamic entries. Set EOF. */ eof = 1; } } return (gfs_readdir_fini(&gstate, error, eofp, eof)); } /* * gfs_vop_lookup: VOP_LOOKUP() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_lookup() as necessary. */ /* ARGSUSED */ int gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); } /* * gfs_vop_readdir: VOP_READDIR() entry point * * For use directly in vnode ops table. Given a GFS directory, calls * gfs_dir_readdir() as necessary. 
*/ /* ARGSUSED */ int gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct, int flags) { return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags)); } /* * gfs_vop_map: VOP_MAP() entry point * * Convenient routine for handling pseudo-files that wish to allow mmap() calls. * This function only works for readonly files, and uses the read function for * the vnode to fill in the data. The mapped data is immediately faulted in and * filled with the necessary data during this call; there are no getpage() or * putpage() routines. */ /* ARGSUSED */ int gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, caller_context_t *ct) { int rv; ssize_t resid = len; /* * Check for bad parameters */ #ifdef _ILP32 if (len > MAXOFF_T) return (ENOMEM); #endif if (vp->v_flag & VNOMAP) return (ENOTSUP); if (off > MAXOFF_T) return (EFBIG); if ((long)off < 0 || (long)(off + len) < 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) return (EACCES); /* * Find appropriate address if needed, otherwise clear address range. */ as_rangelock(as); rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); if (rv != 0) { as_rangeunlock(as); return (rv); } /* * Create mapping */ rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); as_rangeunlock(as); if (rv != 0) return (rv); /* * Fill with data from read() */ rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, 0, (rlim64_t)0, cred, &resid); if (rv == 0 && resid != 0) rv = ENXIO; if (rv != 0) { as_rangelock(as); (void) as_unmap(as, *addrp, len); as_rangeunlock(as); } return (rv); } /* * gfs_vop_inactive: VOP_INACTIVE() entry point * * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. 
*/ /* ARGSUSED */ void gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { gfs_file_t *fp = vp->v_data; void *data; if (fp->gfs_type == GFS_DIR) data = gfs_dir_inactive(vp); else data = gfs_file_inactive(vp); if (data != NULL) kmem_free(data, fp->gfs_size); } Index: vendor-sys/illumos/dist/uts/common/fs/vnode.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/vnode.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/vnode.c (revision 318933) @@ -1,4577 +1,4579 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ ((vp)->v_type != VCHR && (vp)->v_type != VBLK && \ (vp)->v_type != VFIFO && vn_is_readonly(vp)) /* Tunable via /etc/system; used only by admin/install */ int nfs_global_client_only; /* * Array of vopstats_t for per-FS-type vopstats. This array has the same * number of entries as and parallel to the vfssw table. (Arguably, it could * be part of the vfssw table.) Once it's initialized, it's accessed using * the same fstype index that is used to index into the vfssw table. */ vopstats_t **vopstats_fstype; /* vopstats initialization template used for fast initialization via bcopy() */ static vopstats_t *vs_templatep; /* Kmem cache handle for vsk_anchor_t allocations */ kmem_cache_t *vsk_anchor_cache; /* file events cleanup routine */ extern void free_fopdata(vnode_t *); /* * Root of AVL tree for the kstats associated with vopstats. Lock protects * updates to vsktat_tree. */ avl_tree_t vskstat_tree; kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; /* * forward declarations for internal vnode specific data (vsd) */ static void *vsd_realloc(void *, size_t, size_t); /* * forward declarations for reparse point functions */ static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr); /* * VSD -- VNODE SPECIFIC DATA * The v_data pointer is typically used by a file system to store a * pointer to the file system's private node (e.g. ufs inode, nfs rnode). * However, there are times when additional project private data needs * to be stored separately from the data (node) pointed to by v_data. 
* This additional data could be stored by the file system itself or * by a completely different kernel entity. VSD provides a way for * callers to obtain a key and store a pointer to private data associated * with a vnode. * * Callers are responsible for protecting the vsd by holding v_vsd_lock * for calls to vsd_set() and vsd_get(). */ /* * vsd_lock protects: * vsd_nkeys - creation and deletion of vsd keys * vsd_list - insertion and deletion of vsd_node in the vsd_list * vsd_destructor - adding and removing destructors to the list */ static kmutex_t vsd_lock; static uint_t vsd_nkeys; /* size of destructor array */ /* list of vsd_node's */ static list_t *vsd_list = NULL; /* per-key destructor funcs */ static void (**vsd_destructor)(void *); /* * The following is the common set of actions needed to update the * vopstats structure from a vnode op. Both VOPSTATS_UPDATE() and * VOPSTATS_UPDATE_IO() do almost the same thing, except for the * recording of the bytes transferred. Since the code is similar * but small, it is nearly a duplicate. Consequently any changes * to one may need to be reflected in the other. 
* Rundown of the variables: * vp - Pointer to the vnode * counter - Partial name structure member to update in vopstats for counts * bytecounter - Partial name structure member to update in vopstats for bytes * bytesval - Value to update in vopstats for bytes * fstype - Index into vsanchor_fstype[], same as index into vfssw[] * vsp - Pointer to vopstats structure (either in vfs or vsanchor_fstype[i]) */ #define VOPSTATS_UPDATE(vp, counter) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp && vfsp->vfs_implp && \ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ vopstats_t *vsp = &vfsp->vfs_vopstats; \ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ size_t, uint64_t *); \ __dtrace_probe___fsinfo_##counter(vp, 0, stataddr); \ (*stataddr)++; \ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ vsp->n##counter.value.ui64++; \ } \ } \ } #define VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp && vfsp->vfs_implp && \ (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) { \ vopstats_t *vsp = &vfsp->vfs_vopstats; \ uint64_t *stataddr = &(vsp->n##counter.value.ui64); \ extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \ size_t, uint64_t *); \ __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \ (*stataddr)++; \ vsp->bytecounter.value.ui64 += bytesval; \ if ((vsp = vfsp->vfs_fstypevsp) != NULL) { \ vsp->n##counter.value.ui64++; \ vsp->bytecounter.value.ui64 += bytesval; \ } \ } \ } /* * If the filesystem does not support XIDs map credential * If the vfsp is NULL, perhaps we should also map? */ #define VOPXID_MAP_CR(vp, cr) { \ vfs_t *vfsp = (vp)->v_vfsp; \ if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \ cr = crgetmapped(cr); \ } /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
*/ enum vtype iftovt_tab[] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; ushort_t vttoif_tab[] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0 }; /* * The system vnode cache. */ kmem_cache_t *vn_cache; /* * Vnode operations vector. */ static const fs_operation_trans_def_t vn_ops_table[] = { VOPNAME_OPEN, offsetof(struct vnodeops, vop_open), fs_nosys, fs_nosys, VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close), fs_nosys, fs_nosys, VOPNAME_READ, offsetof(struct vnodeops, vop_read), fs_nosys, fs_nosys, VOPNAME_WRITE, offsetof(struct vnodeops, vop_write), fs_nosys, fs_nosys, VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl), fs_nosys, fs_nosys, VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl), fs_setfl, fs_nosys, VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr), fs_nosys, fs_nosys, VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr), fs_nosys, fs_nosys, VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access), fs_nosys, fs_nosys, VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup), fs_nosys, fs_nosys, VOPNAME_CREATE, offsetof(struct vnodeops, vop_create), fs_nosys, fs_nosys, VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove), fs_nosys, fs_nosys, VOPNAME_LINK, offsetof(struct vnodeops, vop_link), fs_nosys, fs_nosys, VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename), fs_nosys, fs_nosys, VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir), fs_nosys, fs_nosys, VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir), fs_nosys, fs_nosys, VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir), fs_nosys, fs_nosys, VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink), fs_nosys, fs_nosys, VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink), fs_nosys, fs_nosys, VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync), fs_nosys, fs_nosys, VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive), fs_nosys, fs_nosys, VOPNAME_FID, offsetof(struct 
vnodeops, vop_fid), fs_nosys, fs_nosys, VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock), fs_rwlock, fs_rwlock, VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock), (fs_generic_func_p) fs_rwunlock, (fs_generic_func_p) fs_rwunlock, /* no errors allowed */ VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek), fs_nosys, fs_nosys, VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp), fs_cmp, fs_cmp, /* no errors allowed */ VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock), fs_frlock, fs_nosys, VOPNAME_SPACE, offsetof(struct vnodeops, vop_space), fs_nosys, fs_nosys, VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp), fs_nosys, fs_nosys, VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage), fs_nosys, fs_nosys, VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage), fs_nosys, fs_nosys, VOPNAME_MAP, offsetof(struct vnodeops, vop_map), (fs_generic_func_p) fs_nosys_map, (fs_generic_func_p) fs_nosys_map, VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap), (fs_generic_func_p) fs_nosys_addmap, (fs_generic_func_p) fs_nosys_addmap, VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap), fs_nosys, fs_nosys, VOPNAME_POLL, offsetof(struct vnodeops, vop_poll), (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll, VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump), fs_nosys, fs_nosys, VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf), fs_pathconf, fs_nosys, VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio), fs_nosys, fs_nosys, VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl), fs_nosys, fs_nosys, VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose), (fs_generic_func_p) fs_dispose, (fs_generic_func_p) fs_nodispose, VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr), fs_nosys, fs_nosys, VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr), fs_fab_acl, fs_nosys, VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock), fs_shrlock, fs_nosys, VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent), (fs_generic_func_p) 
fs_vnevent_nosupport, (fs_generic_func_p) fs_vnevent_nosupport, VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf), fs_nosys, fs_nosys, VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf), fs_nosys, fs_nosys, NULL, 0, NULL, NULL }; /* Extensible attribute (xva) routines. */ /* * Zero out the structure, set the size of the requested/returned bitmaps, * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer * to the returned attributes array. */ void xva_init(xvattr_t *xvap) { bzero(xvap, sizeof (xvattr_t)); xvap->xva_mapsize = XVA_MAPSIZE; xvap->xva_magic = XVA_MAGIC; xvap->xva_vattr.va_mask = AT_XVATTR; xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; } /* * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t * structure. Otherwise, returns NULL. */ xoptattr_t * xva_getxoptattr(xvattr_t *xvap) { xoptattr_t *xoap = NULL; if (xvap->xva_vattr.va_mask & AT_XVATTR) xoap = &xvap->xva_xoptattrs; return (xoap); } /* * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. * We use the f_fsid reported by VFS_STATVFS() since we use that for the * kstat name. */ static int vska_compar(const void *n1, const void *n2) { int ret; ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; if (p1 < p2) { ret = -1; } else if (p1 > p2) { ret = 1; } else { ret = 0; } return (ret); } /* * Used to create a single template which will be bcopy()ed to a newly * allocated vsanchor_combo_t structure in new_vsanchor(), below. 
*/ static vopstats_t * create_vopstats_template() { vopstats_t *vsp; vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); bzero(vsp, sizeof (*vsp)); /* Start fresh */ /* VOP_OPEN */ kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); /* VOP_CLOSE */ kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); /* VOP_READ I/O */ kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); /* VOP_WRITE I/O */ kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); /* VOP_IOCTL */ kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); /* VOP_SETFL */ kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); /* VOP_GETATTR */ kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); /* VOP_SETATTR */ kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); /* VOP_ACCESS */ kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); /* VOP_LOOKUP */ kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); /* VOP_CREATE */ kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); /* VOP_REMOVE */ kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); /* VOP_LINK */ kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); /* VOP_RENAME */ kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); /* VOP_MKDIR */ kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); /* VOP_RMDIR */ kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); /* VOP_READDIR I/O */ kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", KSTAT_DATA_UINT64); /* VOP_SYMLINK */ kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); /* VOP_READLINK */ kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); /* VOP_FSYNC */ kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); /* VOP_INACTIVE */ 
kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); /* VOP_FID */ kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); /* VOP_RWLOCK */ kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); /* VOP_RWUNLOCK */ kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); /* VOP_SEEK */ kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); /* VOP_CMP */ kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); /* VOP_FRLOCK */ kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); /* VOP_SPACE */ kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); /* VOP_REALVP */ kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); /* VOP_GETPAGE */ kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); /* VOP_PUTPAGE */ kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64); /* VOP_MAP */ kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); /* VOP_ADDMAP */ kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); /* VOP_DELMAP */ kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); /* VOP_POLL */ kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); /* VOP_DUMP */ kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); /* VOP_PATHCONF */ kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); /* VOP_PAGEIO */ kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64); /* VOP_DUMPCTL */ kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); /* VOP_DISPOSE */ kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); /* VOP_SETSECATTR */ kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); /* VOP_GETSECATTR */ kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); /* VOP_SHRLOCK */ kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); /* VOP_VNEVENT */ kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); /* VOP_REQZCBUF */ kstat_named_init(&vsp->nreqzcbuf, "nreqzcbuf", KSTAT_DATA_UINT64); 
/* VOP_RETZCBUF */ kstat_named_init(&vsp->nretzcbuf, "nretzcbuf", KSTAT_DATA_UINT64); return (vsp); } /* * Creates a kstat structure associated with a vopstats structure. */ kstat_t * new_vskstat(char *ksname, vopstats_t *vsp) { kstat_t *ksp; if (!vopstats_enabled) { return (NULL); } ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, sizeof (vopstats_t)/sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); if (ksp) { ksp->ks_data = vsp; kstat_install(ksp); } return (ksp); } /* * Called from vfsinit() to initialize the support mechanisms for vopstats */ void vopstats_startup() { if (!vopstats_enabled) return; /* * Creates the AVL tree which holds per-vfs vopstat anchors. This * is necessary since we need to check if a kstat exists before we * attempt to create it. Also, initialize its lock. */ avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), offsetof(vsk_anchor_t, vsk_node)); mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0); /* * Set up the array of pointers for the vopstats-by-FS-type. * The entries will be allocated/initialized as each file system * goes through modload/mod_installfs. */ vopstats_fstype = (vopstats_t **)kmem_zalloc( (sizeof (vopstats_t *) * nfstype), KM_SLEEP); /* Set up the global vopstats initialization template */ vs_templatep = create_vopstats_template(); } /* * We need to have the all of the counters zeroed. * The initialization of the vopstats_t includes on the order of * 50 calls to kstat_named_init(). Rather that do that on every call, * we do it once in a template (vs_templatep) then bcopy it over. */ void initialize_vopstats(vopstats_t *vsp) { if (vsp == NULL) return; bcopy(vs_templatep, vsp, sizeof (vopstats_t)); } /* * If possible, determine which vopstats by fstype to use and * return a pointer to the caller. 
*/ vopstats_t * get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) { int fstype = 0; /* Index into vfssw[] */ vopstats_t *vsp = NULL; if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return (NULL); /* * Set up the fstype. We go to so much trouble because all versions * of NFS use the same fstype in their vfs even though they have * distinct entries in the vfssw[] table. * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. */ if (vswp) { fstype = vswp - vfssw; /* Gets us the index */ } else { fstype = vfsp->vfs_fstype; } /* * Point to the per-fstype vopstats. The only valid values are * non-zero positive values less than the number of vfssw[] table * entries. */ if (fstype > 0 && fstype < nfstype) { vsp = vopstats_fstype[fstype]; } return (vsp); } /* * Generate a kstat name, create the kstat structure, and allocate a * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t * to the caller. This must only be called from a mount. */ vsk_anchor_t * get_vskstat_anchor(vfs_t *vfsp) { char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ statvfs64_t statvfsbuf; /* Needed to find f_fsid */ vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ kstat_t *ksp; /* Ptr to new kstat */ avl_index_t where; /* Location in the AVL tree */ if (vfsp == NULL || vfsp->vfs_implp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return (NULL); /* Need to get the fsid to build a kstat name */ if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { /* Create a name for our kstats based on fsid */ (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", VOPSTATS_STR, statvfsbuf.f_fsid); /* Allocate and initialize the vsk_anchor_t */ vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); bzero(vskp, sizeof (*vskp)); vskp->vsk_fsid = statvfsbuf.f_fsid; mutex_enter(&vskstat_tree_lock); if (avl_find(&vskstat_tree, vskp, &where) == NULL) { avl_insert(&vskstat_tree, vskp, where); mutex_exit(&vskstat_tree_lock); /* * Now that we've got the 
anchor in the AVL * tree, we can create the kstat. */ ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); if (ksp) { vskp->vsk_ksp = ksp; } } else { /* Oops, found one! Release memory and lock. */ mutex_exit(&vskstat_tree_lock); kmem_cache_free(vsk_anchor_cache, vskp); vskp = NULL; } } return (vskp); } /* * We're in the process of tearing down the vfs and need to cleanup * the data structures associated with the vopstats. Must only be called * from dounmount(). */ void teardown_vopstats(vfs_t *vfsp) { vsk_anchor_t *vskap; avl_index_t where; if (vfsp == NULL || vfsp->vfs_implp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) return; /* This is a safe check since VFS_STATS must be set (see above) */ if ((vskap = vfsp->vfs_vskap) == NULL) return; /* Whack the pointer right away */ vfsp->vfs_vskap = NULL; /* Lock the tree, remove the node, and delete the kstat */ mutex_enter(&vskstat_tree_lock); if (avl_find(&vskstat_tree, vskap, &where)) { avl_remove(&vskstat_tree, vskap); } if (vskap->vsk_ksp) { kstat_delete(vskap->vsk_ksp); } mutex_exit(&vskstat_tree_lock); kmem_cache_free(vsk_anchor_cache, vskap); } /* * Read or write a vnode. Called from kernel code. */ int vn_rdwr( enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ cred_t *cr, ssize_t *residp) { struct uio uio; struct iovec iov; int error; int in_crit = 0; if (rw == UIO_WRITE && ISROFILE(vp)) return (EROFS); if (len < 0) return (EIO); VOPXID_MAP_CR(vp, cr); iov.iov_base = base; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_loffset = offset; uio.uio_segflg = (short)seg; uio.uio_resid = len; uio.uio_llimit = ulimit; /* * We have to enter the critical region before calling VOP_RWLOCK * to avoid a deadlock with ufs. 
*/ if (nbl_need_check(vp)) { int svmand; nbl_start_crit(vp, RW_READER); in_crit = 1; error = nbl_svmand(vp, cr, &svmand); if (error != 0) goto done; if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ, uio.uio_offset, uio.uio_resid, svmand, NULL)) { error = EACCES; goto done; } } (void) VOP_RWLOCK(vp, rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); if (rw == UIO_WRITE) { uio.uio_fmode = FWRITE; uio.uio_extflg = UIO_COPY_DEFAULT; error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); } else { uio.uio_fmode = FREAD; uio.uio_extflg = UIO_COPY_CACHED; error = VOP_READ(vp, &uio, ioflag, cr, NULL); } VOP_RWUNLOCK(vp, rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); if (residp) *residp = uio.uio_resid; else if (uio.uio_resid) error = EIO; done: if (in_crit) nbl_end_crit(vp); return (error); } /* * Release a vnode. Call VOP_INACTIVE on last reference or * decrement reference count. * * To avoid race conditions, the v_count is left at 1 for * the call to VOP_INACTIVE. This prevents another thread * from reclaiming and releasing the vnode *before* the * VOP_INACTIVE routine has a chance to destroy the vnode. * We can't have more than 1 thread calling VOP_INACTIVE * on a vnode. */ void vn_rele(vnode_t *vp) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } /* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold * is released. This makes it possible to distinguish vnodes that are referenced * only by the DNLC. 
*/ void vn_rele_dnlc(vnode_t *vp) { VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); mutex_enter(&vp->v_lock); if (--vp->v_count_dnlc == 0) { if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); } mutex_exit(&vp->v_lock); } /* * Like vn_rele() except that it clears v_stream under v_lock. - * This is used by sockfs when it dismantels the association between - * the sockfs node and the vnode in the underlaying file system. + * This is used by sockfs when it dismantles the association between + * the sockfs node and the vnode in the underlying file system. * v_lock has to be held to prevent a thread coming through the lookupname * path from accessing a stream head that is going away. */ void vn_rele_stream(vnode_t *vp) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); vp->v_stream = NULL; if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } static void vn_rele_inactive(vnode_t *vp) { VOP_INACTIVE(vp, CRED(), NULL); } /* * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it * asynchronously using a taskq. This can avoid deadlocks caused by re-entering * the file system as a result of releasing the vnode. Note, file systems * already have to handle the race where the vnode is incremented before the * inactive routine is called and does its locking. * * Warning: Excessive use of this routine can lead to performance problems. * This is because taskqs throttle back allocation if too many are created. 
*/ void vn_rele_async(vnode_t *vp, taskq_t *taskq) { VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != NULL); return; } - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } int vn_open( char *pnamep, enum uio_seg seg, int filemode, int createmode, struct vnode **vpp, enum create crwhy, mode_t umask) { return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, umask, NULL, -1)); } /* * Open/create a vnode. * This may be callable by the kernel, the only known use * of user context being that the current user credentials * are used for permissions. crwhy is defined iff filemode & FCREAT. */ int vn_openat( char *pnamep, enum uio_seg seg, int filemode, int createmode, struct vnode **vpp, enum create crwhy, mode_t umask, struct vnode *startvp, int fd) { struct vnode *vp; int mode; int accessflags; int error; int in_crit = 0; int open_done = 0; int shrlock_done = 0; struct vattr vattr; enum symfollow follow; int estale_retry = 0; struct shrlock shr; struct shr_locowner shr_own; mode = 0; accessflags = 0; if (filemode & FREAD) mode |= VREAD; if (filemode & (FWRITE|FTRUNC)) mode |= VWRITE; if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN)) mode |= VEXEC; /* symlink interpretation */ if (filemode & FNOFOLLOW) follow = NO_FOLLOW; else follow = FOLLOW; if (filemode & FAPPEND) accessflags |= V_APPEND; top: if (filemode & FCREAT) { enum vcexcl excl; /* * Wish to create a file. */ vattr.va_type = VREG; vattr.va_mode = createmode; vattr.va_mask = AT_TYPE|AT_MODE; if (filemode & FTRUNC) { vattr.va_size = 0; vattr.va_mask |= AT_SIZE; } if (filemode & FEXCL) excl = EXCL; else excl = NONEXCL; if (error = vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy, (filemode & ~(FTRUNC|FEXCL)), umask, startvp)) return (error); } else { /* * Wish to open a file. Just look it up. 
*/ if (error = lookupnameat(pnamep, seg, follow, NULLVPP, &vp, startvp)) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto top; return (error); } /* * Get the attributes to check whether file is large. * We do this only if the FOFFMAX flag is not set and * only for regular files. */ if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) { vattr.va_mask = AT_SIZE; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { goto out; } if (vattr.va_size > (u_offset_t)MAXOFF32_T) { /* * Large File API - regular open fails * if FOFFMAX flag is set in file mode */ error = EOVERFLOW; goto out; } } /* * Can't write directories, active texts, or * read-only filesystems. Can't truncate files * on which mandatory locking is in effect. */ if (filemode & (FWRITE|FTRUNC)) { /* * Allow writable directory if VDIROPEN flag is set. */ if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) { error = EISDIR; goto out; } if (ISROFILE(vp)) { error = EROFS; goto out; } /* * Can't truncate files on which * sysv mandatory locking is in effect. */ if (filemode & FTRUNC) { vnode_t *rvp; if (VOP_REALVP(vp, &rvp, NULL) != 0) rvp = vp; if (rvp->v_filocks != NULL) { vattr.va_mask = AT_MODE; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) == 0 && MANDLOCK(vp, vattr.va_mode)) error = EAGAIN; } } if (error) goto out; } /* * Check permissions. */ if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL)) goto out; /* * Require FSEARCH to return a directory. * Require FEXEC to return a regular file. */ if ((filemode & FSEARCH) && vp->v_type != VDIR) { error = ENOTDIR; goto out; } if ((filemode & FEXEC) && vp->v_type != VREG) { error = ENOEXEC; /* XXX: error code? */ goto out; } } /* * Do remaining checks for FNOFOLLOW and FNOLINKS. 
*/ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { error = ELOOP; goto out; } if (filemode & FNOLINKS) { vattr.va_mask = AT_NLINK; if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) { goto out; } if (vattr.va_nlink != 1) { error = EMLINK; goto out; } } /* * Opening a socket corresponding to the AF_UNIX pathname * in the filesystem name space is not supported. * However, VSOCK nodes in namefs are supported in order * to make fattach work for sockets. * * XXX This uses VOP_REALVP to distinguish between * an unopened namefs node (where VOP_REALVP returns a * different VSOCK vnode) and a VSOCK created by vn_create * in some file system (where VOP_REALVP would never return * a different vnode). */ if (vp->v_type == VSOCK) { struct vnode *nvp; error = VOP_REALVP(vp, &nvp, NULL); if (error != 0 || nvp == NULL || nvp == vp || nvp->v_type != VSOCK) { error = EOPNOTSUPP; goto out; } } if ((vp->v_type == VREG) && nbl_need_check(vp)) { /* get share reservation */ shr.s_access = 0; if (filemode & FWRITE) shr.s_access |= F_WRACC; if (filemode & FREAD) shr.s_access |= F_RDACC; shr.s_deny = 0; shr.s_sysid = 0; shr.s_pid = ttoproc(curthread)->p_pid; shr_own.sl_pid = shr.s_pid; shr_own.sl_id = fd; shr.s_own_len = sizeof (shr_own); shr.s_owner = (caddr_t)&shr_own; error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(), NULL); if (error) goto out; shrlock_done = 1; /* nbmand conflict check if truncating file */ if ((filemode & FTRUNC) && !(filemode & FCREAT)) { nbl_start_crit(vp, RW_READER); in_crit = 1; vattr.va_mask = AT_SIZE; if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) goto out; if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0, NULL)) { error = EACCES; goto out; } } } /* * Do opening protocol. */ error = VOP_OPEN(&vp, filemode, CRED(), NULL); if (error) goto out; open_done = 1; /* * Truncate if required. 
*/ if ((filemode & FTRUNC) && !(filemode & FCREAT)) { vattr.va_size = 0; vattr.va_mask = AT_SIZE; if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) goto out; } out: ASSERT(vp->v_count > 0); if (in_crit) { nbl_end_crit(vp); in_crit = 0; } if (error) { if (open_done) { (void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(), NULL); open_done = 0; shrlock_done = 0; } if (shrlock_done) { (void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(), NULL); shrlock_done = 0; } /* * The following clause was added to handle a problem * with NFS consistency. It is possible that a lookup * of the file to be opened succeeded, but the file * itself doesn't actually exist on the server. This * is chiefly due to the DNLC containing an entry for * the file which has been removed on the server. In * this case, we just start over. If there was some * other cause for the ESTALE error, then the lookup * of the file will fail and the error will be returned * above instead of looping around from here. */ VN_RELE(vp); if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto top; } else *vpp = vp; return (error); } /* * The following two accessor functions are for the NFSv4 server. Since there * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the * vnode open counts correct when a client "upgrades" an open or does an * open_downgrade. In NFS, an upgrade or downgrade can not only change the * open mode (add or subtract read or write), but also change the share/deny * modes. However, share reservations are not integrated with OPEN, yet, so * we need to handle each separately. These functions are cleaner than having * the NFS server manipulate the counts directly, however, nobody else should * use these functions. 
 */

/*
 * Account for an upgrade of the open modes on an already-open regular
 * file: bump the vnode's read and/or write open counts for each mode
 * (FREAD/FWRITE) present in filemode.  Counts are kept only for VREG
 * vnodes (asserted).
 */
void
vn_open_upgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD)
		atomic_inc_32(&vp->v_rdcnt);
	if (filemode & FWRITE)
		atomic_inc_32(&vp->v_wrcnt);

}

/*
 * Inverse of vn_open_upgrade(): drop the read and/or write open counts
 * for each mode present in filemode.  The counts must not underflow
 * (asserted).
 */
void
vn_open_downgrade(
	vnode_t *vp,
	int filemode)
{
	ASSERT(vp->v_type == VREG);

	if (filemode & FREAD) {
		ASSERT(vp->v_rdcnt > 0);
		atomic_dec_32(&vp->v_rdcnt);
	}
	if (filemode & FWRITE) {
		ASSERT(vp->v_wrcnt > 0);
		atomic_dec_32(&vp->v_wrcnt);
	}

}

/*
 * Convenience front end to vn_createat() with a NULL start vnode
 * (i.e. the pathname is resolved from the process's current/root
 * directory rather than relative to an explicit starting vnode).
 */
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask)
{
	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
	    umask, NULL));
}

/*
 * Create a vnode (makenode).
 *
 * pnamep/seg: pathname to create and the address space it lives in.
 * vap: attributes for the new node; AT_TYPE and AT_MODE must be set
 *	(asserted).
 * excl: EXCL for exclusive create, NONEXCL otherwise.
 * vpp: on success, holds the created (or pre-existing, for NONEXCL)
 *	vnode.
 * why: CRMKDIR / CRMKNOD / open-style create; selects VOP_MKDIR vs.
 *	VOP_CREATE and some mode handling.
 * startvp: optional starting vnode for relative lookup (may be NULL).
 *
 * Returns 0 or an errno.  ESTALE from lookup or the VOP is retried
 * (bounded by fs_need_estale_retry()).
 */
int
vn_createat(
	char *pnamep,
	enum uio_seg seg,
	struct vattr *vap,
	enum vcexcl excl,
	int mode,
	struct vnode **vpp,
	enum create why,
	int flag,
	mode_t umask,
	struct vnode *startvp)
{
	struct vnode *dvp;	/* ptr to parent dir vnode */
	struct vnode *vp = NULL;
	struct pathname pn;
	int error;
	int in_crit = 0;	/* nonzero iff we hold an nbl critical region */
	struct vattr vattr;
	enum symfollow follow;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/* symlink interpretation */
	if ((flag & FNOFOLLOW) || excl == EXCL)
		follow = NO_FOLLOW;
	else
		follow = FOLLOW;
	flag &= ~(FNOFOLLOW|FNOLINKS);

top:
	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return (error);
	if (auditing)
		audit_vncreate_start();
	dvp = NULL;
	*vpp = NULL;
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	error = lookuppnat(&pn, NULL, follow, &dvp,
	    (excl == EXCL) ? NULLVPP : vpp, startvp);
	if (error) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return (error);
	}

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * If default ACLs are defined for the directory don't apply the
	 * umask if umask is passed.
	 */

	if (umask) {
		vsecattr_t vsec;

		vsec.vsa_aclcnt = 0;
		vsec.vsa_aclentp = NULL;
		vsec.vsa_dfaclcnt = 0;
		vsec.vsa_dfaclentp = NULL;
		vsec.vsa_mask = VSA_DFACLCNT;
		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
		/*
		 * If error is ENOSYS then treat it as no error.
		 * Don't want to force all file systems to support
		 * aclent_t style of ACL's.
		 */
		if (error == ENOSYS)
			error = 0;
		if (error) {
			if (*vpp != NULL)
				VN_RELE(*vpp);
			goto out;
		} else {
			/*
			 * Apply the umask if no default ACLs.
			 */
			if (vsec.vsa_dfaclcnt == 0)
				vap->va_mode &= ~umask;

			/*
			 * VOP_GETSECATTR() may have allocated memory for
			 * ACLs we didn't request, so double-check and
			 * free it if necessary.
			 */
			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_aclentp,
				    vsec.vsa_aclcnt * sizeof (aclent_t));
			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
				kmem_free((caddr_t)vsec.vsa_dfaclentp,
				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
		}
	}

	/*
	 * In general we want to generate EROFS if the file system is
	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
	 * documents the open system call, and it says that O_CREAT has no
	 * effect if the file already exists.  Bug 1119649 states
	 * that open(path, O_CREAT, ...) fails when attempting to open an
	 * existing file on a read only file system.  Thus, the first part
	 * of the following if statement has 3 checks:
	 *	if the file exists &&
	 *		it is being open with write access &&
	 *		the file system is read only
	 *	then generate EROFS
	 */
	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (excl == NONEXCL && *vpp != NULL) {
		vnode_t *rvp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return error.
		 */
		vp = *vpp;
		if (VOP_REALVP(vp, &rvp, NULL) != 0)
			rvp = vp;
		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}
		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
			vattr.va_mask = AT_MODE|AT_SIZE;
			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
				goto out;
			}
			if (MANDLOCK(vp, vattr.va_mode)) {
				error = EAGAIN;
				goto out;
			}
			/*
			 * File cannot be truncated if non-blocking mandatory
			 * locks are currently on the file.
			 */
			if ((vap->va_mask & AT_SIZE) && in_crit) {
				u_offset_t offset;
				ssize_t length;

				offset = vap->va_size > vattr.va_size ?
				    vattr.va_size : vap->va_size;
				length = vap->va_size > vattr.va_size ?
				    vap->va_size - vattr.va_size :
				    vattr.va_size - vap->va_size;
				if (nbl_conflict(vp, NBL_WRITE, offset,
				    length, 0, NULL)) {
					error = EACCES;
					goto out;
				}
			}
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The intercession of the file system is necessary to
		 * ensure that the appropriate permission checks are
		 * done.
		 */
		if (vp->v_flag & VROOT) {
			ASSERT(why != CRMKDIR);
			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
			    CRED(), flag, NULL, NULL);
			/*
			 * If the create succeeded, it will have created
			 * a new reference to the vnode.  Give up the
			 * original reference.  The assertion should not
			 * get triggered because NBMAND locks only apply to
			 * VREG files.  And if in_crit is non-zero for some
			 * reason, detect that here, rather than when we
			 * dereference a null vp.
			 */
			ASSERT(in_crit == 0);
			VN_RELE(vp);
			vp = NULL;
			goto out;
		}

		/*
		 * Large File API - non-large open (FOFFMAX flag not set)
		 * of regular file fails if the file size exceeds MAXOFF32_T.
		 */
		if (why != CRMKDIR &&
		    !(flag & FOFFMAX) &&
		    (vp->v_type == VREG)) {
			vattr.va_mask = AT_SIZE;
			if ((error = VOP_GETATTR(vp, &vattr, 0,
			    CRED(), NULL))) {
				goto out;
			}
			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
				error = EOVERFLOW;
				goto out;
			}
		}
	}

	if (error == 0) {
		/*
		 * Call mkdir() if specified, otherwise create().
		 */
		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */

		if (why == CRMKDIR)
			/*
			 * N.B., if vn_createat() ever requests
			 * case-insensitive behavior then it will need
			 * to be passed to VOP_MKDIR().  VOP_CREATE()
			 * will already get it via "flag"
			 */
			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
			    NULL, 0, NULL);
		else if (!must_be_dir)
			error = VOP_CREATE(dvp, pn.pn_path, vap,
			    excl, mode, vpp, CRED(), flag, NULL, NULL);
		else
			error = ENOTDIR;
	}

out:

	if (auditing)
		audit_vncreate_finish(*vpp, error);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL) {
		VN_RELE(vp);
		vp = NULL;
	}
	pn_free(&pn);
	VN_RELE(dvp);
	/*
	 * The following clause was added to handle a problem
	 * with NFS consistency.  It is possible that a lookup
	 * of the file to be created succeeded, but the file
	 * itself doesn't actually exist on the server.  This
	 * is chiefly due to the DNLC containing an entry for
	 * the file which has been removed on the server.  In
	 * this case, we just start over.  If there was some
	 * other cause for the ESTALE error, then the lookup
	 * of the file will fail and the error will be returned
	 * above instead of looping around from here.
	 */
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Convenience front end to vn_linkat() with NULL start vnodes and
 * NO_FOLLOW symlink handling on the source.
 */
int
vn_link(char *from, char *to, enum uio_seg seg)
{
	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
}

/*
 * Make a hard link named 'to' referring to the existing object 'from',
 * each optionally resolved relative to a start vnode.  Both pathnames
 * live in the address space indicated by seg.  Fails with EXDEV if the
 * source and the target directory are on different filesystems
 * (compared by fsid) and EROFS if the target filesystem is read-only.
 * ESTALE failures are retried (bounded by fs_need_estale_retry()).
 */
int
vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
    vnode_t *tstartvp, char *to, enum uio_seg seg)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	int error;
	struct vattr vattr;
	dev_t fsid;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return (error);
	if (auditing && fstartvp != NULL)
		audit_setfsat_path(1);
	if (error = lookupnameat(from, seg, follow, NULLVPP, &fvp, fstartvp))
		goto out;
	if (auditing && tstartvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &tdvp, NULLVPP, tstartvp))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(fvp, &vattr, 0, CRED(), NULL))
		goto out;
	fsid = vattr.va_fsid;
	vattr.va_mask = AT_FSID;
	if (error = VOP_GETATTR(tdvp, &vattr, 0, CRED(), NULL))
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}

	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Do the link.
	 */
	(void) pn_fixslash(&pn);
	error = VOP_LINK(tdvp, fvp, pn.pn_path, CRED(), NULL, 0);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Convenience front end to vn_renameat() with NULL start vnodes.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	return (vn_renameat(NULL, from, NULL, to, seg));
}

/*
 * Rename fname to tname, each optionally resolved relative to a start
 * directory vnode.  Enforces same-filesystem (EXDEV, compared by fsid),
 * read-only (EROFS), and non-blocking-mandatory-lock (EACCES) checks
 * before delegating to VOP_RENAME() on the source directory.  ESTALE
 * failures are retried (bounded by fs_need_estale_retry()).
 */
int
vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
    char *tname, enum uio_seg seg)
{
	int error;
	struct vattr vattr;
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	dev_t fsid;
	int in_crit_src, in_crit_targ;
	vnode_t *fromvp, *fvp;
	vnode_t *tovp, *targvp;
	int estale_retry = 0;
	uint32_t auditing = AU_AUDITING();

top:
	fvp = fromvp = tovp = targvp = NULL;
	in_crit_src = in_crit_targ = 0;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(fname, seg, &fpn))
		return (error);
	if (error = pn_get(tname, seg, &tpn)) {
		pn_free(&fpn);
		return (error);
	}

	/*
	 * First we need to resolve the correct directories.
	 * The passed in directories may only be a starting point,
	 * but we need the real directories the file(s) live in.
	 * For example the fname may be something like usr/lib/sparc
	 * and we were passed in the / directory, but we need to
	 * use the lib directory for the rename.
	 */

	if (auditing && fdvp != NULL)
		audit_setfsat_path(1);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
		goto out;
	}

	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}

	if (auditing && tdvp != NULL)
		audit_setfsat_path(3);
	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
		goto out;
	}

	/*
	 * Make sure both the from vnode directory and the to directory
	 * are in the same vfs and the to directory is writable.
	 * We check fsid's, not vfs pointers, so loopback fs works.
	 */
	if (fromvp != tovp) {
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
			goto out;
		fsid = vattr.va_fsid;
		vattr.va_mask = AT_FSID;
		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
			goto out;
		if (fsid != vattr.va_fsid) {
			error = EXDEV;
			goto out;
		}
	}

	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * If the target already exists, enter its nbl critical region and
	 * make sure removing it does not conflict with a share reservation.
	 */
	if (targvp && (fvp != targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit_targ = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	if (nbl_need_check(fvp)) {
		nbl_start_crit(fvp, RW_READER);
		in_crit_src = 1;
		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	/*
	 * Do the rename.
	 */
	(void) pn_fixslash(&tpn);
	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
	    NULL, 0);

out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (in_crit_src)
		nbl_end_crit(fvp);
	if (in_crit_targ)
		nbl_end_crit(targvp);
	if (fromvp)
		VN_RELE(fromvp);
	if (tovp)
		VN_RELE(tovp);
	if (targvp)
		VN_RELE(targvp);
	if (fvp)
		VN_RELE(fvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	return (vn_removeat(NULL, fnamep, seg, dirflag));
}

/*
 * Remove the entry named by fnamep, optionally resolved relative to
 * startvp.  dirflag selects rmdir(2) (RMDIRECTORY) vs. unlink(2)
 * semantics.  Handles the special case of removing a file that has a
 * namefs mount on top of it (unmounts first, then removes the covered
 * vnode).  ESTALE failures are retried (bounded by
 * fs_need_estale_retry()).
 */
int
vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct vnode *coveredvp;
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	int error;
	struct vfs *vfsp;
	struct vfs *dvfsp;	/* ptr to parent dir vfs */
	int in_crit = 0;
	int estale_retry = 0;

top:
	if (error = pn_get(fnamep, seg, &pn))
		return (error);
	dvp = vp = NULL;
	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
		pn_free(&pn);
		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
			goto top;
		return (error);
	}

	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}

	vfsp = vp->v_vfsp;
	dvfsp = dvp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail,
	 * unless it's marked unlinkable.  In that case, unmount the
	 * filesystem and proceed to unlink the covered vnode.  (If the
	 * covered vnode is a directory, use rmdir instead of unlink,
	 * to avoid file system corruption.)
	 */
	if (vp->v_flag & VROOT) {
		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
			error = EBUSY;
			goto out;
		}

		/*
		 * Namefs specific code starts here.
		 */

		if (dirflag == RMDIRECTORY) {
			/*
			 * User called rmdir(2) on a file that has
			 * been namefs mounted on top of.  Since
			 * namefs doesn't allow directories to
			 * be mounted on other files we know
			 * vp is not of type VDIR so fail the operation.
			 */
			error = ENOTDIR;
			goto out;
		}

		/*
		 * If VROOT is still set after grabbing vp->v_lock,
		 * no one has finished nm_unmount so far and coveredvp
		 * is valid.
		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
		 * vp->v_lock, any race window is eliminated.
		 */

		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & VROOT) == 0) {
			/* Someone beat us to the unmount */
			mutex_exit(&vp->v_lock);
			error = EBUSY;
			goto out;
		}
		vfsp = vp->v_vfsp;
		coveredvp = vfsp->vfs_vnodecovered;
		ASSERT(coveredvp);
		/*
		 * Note: Implementation of vn_vfswlock shows that ordering of
		 * v_lock / vn_vfswlock is not an issue here.
		 */
		error = vn_vfswlock(coveredvp);
		mutex_exit(&vp->v_lock);

		if (error)
			goto out;

		VN_HOLD(coveredvp);
		VN_RELE(vp);
		error = dounmount(vfsp, 0, CRED());

		/*
		 * Unmounted the namefs file system; now get
		 * the object it was mounted over.
		 */
		vp = coveredvp;
		/*
		 * If namefs was mounted over a directory, then
		 * we want to use rmdir() instead of unlink().
		 */
		if (vp->v_type == VDIR)
			dirflag = RMDIRECTORY;

		if (error)
			goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 * We check the parent directory's vfs in case this is an lofs vnode.
	 */
	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	vtype = vp->v_type;

	/*
	 * If there is the possibility of an nbmand share reservation, make
	 * sure it's okay to remove the file.  Keep a reference to the
	 * vnode, so that we can exit the nbl critical region after
	 * calling VOP_REMOVE.
	 * If there is no possibility of an nbmand share reservation,
	 * release the vnode reference now.  Filesystems like NFS may
	 * behave differently if there is an extra reference, so get rid of
	 * this one.  Fortunately, we can't have nbmand mounts on NFS
	 * filesystems.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	} else {
		VN_RELE(vp);
		vp = NULL;
	}

	if (dirflag == RMDIRECTORY) {
		/*
		 * Caller is using rmdir(2), which can only be applied to
		 * directories.
		 */
		if (vtype != VDIR) {
			error = ENOTDIR;
		} else {
			vnode_t *cwd;
			proc_t *pp = curproc;

			mutex_enter(&pp->p_lock);
			cwd = PTOU(pp)->u_cdir;
			VN_HOLD(cwd);
			mutex_exit(&pp->p_lock);
			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
			    NULL, 0);
			VN_RELE(cwd);
		}
	} else {
		/*
		 * Unlink(2) can be applied to anything.
		 */
		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
	}

out:
	pn_free(&pn);
	if (in_crit) {
		nbl_end_crit(vp);
		in_crit = 0;
	}
	if (vp != NULL)
		VN_RELE(vp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
		goto top;
	return (error);
}

/*
 * Utility function to compare equality of vnodes.
 * Compare the underlying real vnodes, if there are underlying vnodes.
 * This is a more thorough comparison than the VN_CMP() macro provides.
 */
int
vn_compare(vnode_t *vp1, vnode_t *vp2)
{
	vnode_t *realvp;

	if (vp1 != NULL && VOP_REALVP(vp1, &realvp, NULL) == 0)
		vp1 = realvp;
	if (vp2 != NULL && VOP_REALVP(vp2, &realvp, NULL) == 0)
		vp2 = realvp;
	return (VN_CMP(vp1, vp2));
}

/*
 * The number of locks to hash into.  This value must be a power
 * of 2 minus 1 and should probably also be prime.
 */
#define	NUM_BUCKETS	1023

/*
 * One hash bucket of vfs/vnode lock entries.  The pad keeps each bucket
 * on its own 64-byte boundary (see the #pragma align below).
 */
struct vn_vfslocks_bucket {
	kmutex_t vb_lock;		/* protects vb_list */
	vn_vfslocks_entry_t *vb_list;	/* chain of entries in this bucket */
	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
};

/*
 * Total number of buckets will be NUM_BUCKETS + 1 .
 */

#pragma	align	64(vn_vfslocks_buckets)
static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];

#define	VN_VFSLOCKS_SHIFT	9

#define	VN_VFSLOCKS_HASH(vfsvpptr)	\
	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)

/*
 * vn_vfslocks_getlock() uses a hash scheme to generate
 * rwstlock using vfs/vnode pointer passed to it.
 *
 * vn_vfslocks_rele() releases a reference in the
 * hash table which allows the entry allocated by
 * vn_vfslocks_getlock() to be freed at a later
 * stage when the refcount drops to zero.
 */

/*
 * Look up (or create) the lock entry hashed from the given vfs/vnode
 * pointer, taking a reference on it.  The entry's ve_lock is the
 * rwstlock associated with that pointer.  Allocation is done outside
 * the bucket lock; if another thread installed an equal entry in the
 * meantime, the freshly allocated one is destroyed and the existing
 * entry returned.
 */
vn_vfslocks_entry_t *
vn_vfslocks_getlock(void *vfsvpptr)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *tvep;

	ASSERT(vfsvpptr != NULL);
	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];

	mutex_enter(&bp->vb_lock);
	for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
		if (vep->ve_vpvfs == vfsvpptr) {
			vep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);
			return (vep);
		}
	}
	mutex_exit(&bp->vb_lock);
	vep = kmem_alloc(sizeof (*vep), KM_SLEEP);
	rwst_init(&vep->ve_lock, NULL, RW_DEFAULT, NULL);
	vep->ve_vpvfs = (char *)vfsvpptr;
	vep->ve_refcnt = 1;
	mutex_enter(&bp->vb_lock);
	for (tvep = bp->vb_list; tvep != NULL; tvep = tvep->ve_next) {
		if (tvep->ve_vpvfs == vfsvpptr) {
			tvep->ve_refcnt++;
			mutex_exit(&bp->vb_lock);

			/*
			 * There is already an entry in the hash;
			 * destroy what we just allocated.
			 */
			rwst_destroy(&vep->ve_lock);
			kmem_free(vep, sizeof (*vep));
			return (tvep);
		}
	}
	vep->ve_next = bp->vb_list;
	bp->vb_list = vep;
	mutex_exit(&bp->vb_lock);
	return (vep);
}

/*
 * Drop a reference on a lock entry obtained from vn_vfslocks_getlock().
 * When the refcount reaches zero the entry is unlinked from its bucket
 * and freed.  Panics on refcount underflow or if the entry is missing
 * from its bucket.
 */
void
vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
{
	struct vn_vfslocks_bucket *bp;
	vn_vfslocks_entry_t *vep;
	vn_vfslocks_entry_t *pvep;

	ASSERT(vepent != NULL);
	ASSERT(vepent->ve_vpvfs != NULL);

	bp = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];

	mutex_enter(&bp->vb_lock);
	vepent->ve_refcnt--;

	if ((int32_t)vepent->ve_refcnt < 0)
		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");

	if (vepent->ve_refcnt == 0) {
		for (vep = bp->vb_list; vep != NULL; vep = vep->ve_next) {
			if (vep->ve_vpvfs == vepent->ve_vpvfs) {
				if (bp->vb_list == vep)
					bp->vb_list = vep->ve_next;
				else {
					/*
					 * pvep is set on every prior
					 * iteration; the head case is
					 * handled above.
					 */
					/* LINTED */
					pvep->ve_next = vep->ve_next;
				}
				mutex_exit(&bp->vb_lock);
				rwst_destroy(&vep->ve_lock);
				kmem_free(vep, sizeof (*vep));
				return;
			}
			pvep = vep;
		}
		cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
	}
	mutex_exit(&bp->vb_lock);
}

/*
 * vn_vfswlock_wait is used to implement a lock which is logically a writers
 * lock protecting the v_vfsmountedhere field.
 * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
 * except that it blocks to acquire the lock VVFSLOCK.
 *
 * traverse() and routines re-implementing part of traverse (e.g. autofs)
 * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
 * need the non-blocking version of the writers lock i.e.
 * vn_vfswlock
 */
int
vn_vfswlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_WRITER);

	if (retval == EINTR) {
		/* interrupted by a signal: drop our table reference */
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}
	return (retval);
}

/*
 * Blocking reader-side counterpart of vn_vfswlock_wait().
 */
int
vn_vfsrlock_wait(vnode_t *vp)
{
	int retval;
	vn_vfslocks_entry_t *vpvfsentry;
	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	retval = rwst_enter_sig(&vpvfsentry->ve_lock, RW_READER);

	if (retval == EINTR) {
		/* interrupted by a signal: drop our table reference */
		vn_vfslocks_rele(vpvfsentry);
		return (EINTR);
	}

	return (retval);
}

/*
 * vn_vfswlock is used to implement a lock which is logically a writers lock
 * protecting the v_vfsmountedhere field.  Non-blocking: returns EBUSY if
 * the lock cannot be acquired immediately.
 */
int
vn_vfswlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
		return (0);

	/* lock not acquired: drop our table reference */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Non-blocking reader-side counterpart of vn_vfswlock().
 */
int
vn_vfsrlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * If vp is NULL then somebody is trying to lock the covered vnode
	 * of /.  (vfs_vnodecovered is NULL for /).  This situation will
	 * only happen when unmounting /.  Since that operation will fail
	 * anyway, return EBUSY here instead of in VFS_UNMOUNT.
	 */
	if (vp == NULL)
		return (EBUSY);

	vpvfsentry = vn_vfslocks_getlock(vp);

	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
		return (0);

	/* lock not acquired: drop our table reference */
	vn_vfslocks_rele(vpvfsentry);
	return (EBUSY);
}

/*
 * Release a lock previously acquired by one of the vn_vfs[rw]lock*
 * routines above.
 */
void
vn_vfsunlock(vnode_t *vp)
{
	vn_vfslocks_entry_t *vpvfsentry;

	/*
	 * ve_refcnt needs to be decremented twice.
	 * 1. To release reference after a call to vn_vfslocks_getlock()
	 * 2. To release the reference from the locking routines like
	 *    vn_vfsrlock/vn_vfswlock etc,.
	 */
	vpvfsentry = vn_vfslocks_getlock(vp);
	vn_vfslocks_rele(vpvfsentry);

	rwst_exit(&vpvfsentry->ve_lock);
	vn_vfslocks_rele(vpvfsentry);
}

/*
 * Return nonzero if the write-side vfs lock for this vnode is held.
 */
int
vn_vfswlock_held(vnode_t *vp)
{
	int held;
	vn_vfslocks_entry_t *vpvfsentry;

	ASSERT(vp != NULL);

	vpvfsentry = vn_vfslocks_getlock(vp);
	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);

	vn_vfslocks_rele(vpvfsentry);
	return (held);
}

/*
 * Allocate a vnodeops_t and populate it from the given operation
 * template via fs_build_vector().  On failure the allocation is freed
 * and the error from fs_build_vector() is returned; on success *actual
 * holds the new vector.
 */
int
vn_make_ops(
	const char *name,			/* Name of file system */
	const fs_operation_def_t *templ,	/* Operation specification */
	vnodeops_t **actual)			/* Return the vnodeops */
{
	int unused_ops;
	int error;

	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);

	(*actual)->vnop_name = name;

	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
	if (error) {
		kmem_free(*actual, sizeof (vnodeops_t));
	}

#if DEBUG
	if (unused_ops != 0)
		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
		    "but not used", name, unused_ops);
#endif

	return (error);
}

/*
 * Free the vnodeops created as a result of vn_make_ops()
 */
void
vn_freevnodeops(vnodeops_t *vnops)
{
	kmem_free(vnops, sizeof (vnodeops_t));
}

/*
 * Vnode cache.
 */

/*
 * kmem cache constructor: initialize a vnode's embedded synchronization
 * objects and NULL out the pointer fields that vn_reinit()/vn_recycle()
 * inspect.
 */
/* ARGSUSED */
static int
vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct vnode *vp;

	vp = buf;

	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
	vp->v_path = NULL;
	vp->v_mpssdata = NULL;
	vp->v_vsd = NULL;
	vp->v_fopdata = NULL;

	return (0);
}

/*
 * kmem cache destructor: tear down what vn_cache_constructor() set up.
 */
/* ARGSUSED */
static void
vn_cache_destructor(void *buf, void *cdrarg)
{
	struct vnode *vp;

	vp = buf;

	rw_destroy(&vp->v_nbllock);
	cv_destroy(&vp->v_cv);
	mutex_destroy(&vp->v_vsd_lock);
	mutex_destroy(&vp->v_lock);
}

/*
 * Create the global vnode kmem cache (vn_cache).
 */
void
vn_create_cache(void)
{
	/* LINTED */
	ASSERT((1 << VNODE_ALIGN_LOG2) ==
	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
	vn_cache = kmem_cache_create("vn_cache", sizeof (struct vnode),
	    VNODE_ALIGN, vn_cache_constructor, vn_cache_destructor, NULL, NULL,
	    NULL, 0);
}

/*
 * Destroy the global vnode kmem cache.
 */
void
vn_destroy_cache(void)
{
	kmem_cache_destroy(vn_cache);
}

/*
 * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
 * cached by the file system and vnodes remain associated.
 */
void
vn_recycle(vnode_t *vp)
{
	ASSERT(vp->v_pages == NULL);

	/*
	 * XXX - This really belongs in vn_reinit(), but we have some issues
	 * with the counts.  Best to have it here for clean initialization.
	 */
	vp->v_rdcnt = 0;
	vp->v_wrcnt = 0;
	vp->v_mmap_read = 0;
	vp->v_mmap_write = 0;

	/*
	 * If FEM was in use, make sure everything gets cleaned up
	 * NOTE: vp->v_femhead is initialized to NULL in the vnode
	 * constructor.
	 */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}
	if (vp->v_path) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
}

/*
 * Used to reset the vnode fields including those that are directly accessible
 * as well as those which require an accessor function.
 *
 * Does not initialize:
 *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
 *	v_data (since FS-nodes and vnodes point to each other and should
 *		be updated simultaneously)
 *	v_op (in case someone needs to make a VOP call on this object)
 */
void
vn_reinit(vnode_t *vp)
{
	vp->v_count = 1;
	vp->v_count_dnlc = 0;
	vp->v_vfsp = NULL;
	vp->v_stream = NULL;
	vp->v_vfsmountedhere = NULL;
	vp->v_flag = 0;
	vp->v_type = VNON;
	vp->v_rdev = NODEV;

	vp->v_filocks = NULL;
	vp->v_shrlocks = NULL;
	vp->v_pages = NULL;

	vp->v_locality = NULL;
	vp->v_xattrdir = NULL;

	/* Handles v_femhead, v_path, and the r/w/map counts */
	vn_recycle(vp);
}

/*
 * Allocate a fresh vnode from the vnode cache and initialize it via
 * vn_reinit().  kmflag is the kmem allocation flag (e.g. KM_SLEEP), so
 * the result may be NULL for non-sleeping allocations.
 */
vnode_t *
vn_alloc(int kmflag)
{
	vnode_t *vp;

	vp = kmem_cache_alloc(vn_cache, kmflag);

	if (vp != NULL) {
		vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
		vp->v_fopdata = NULL;
		vn_reinit(vp);
	}

	return (vp);
}

/*
 * Release a vnode back to the vnode cache, freeing the cached path,
 * FEM head, and per-vnode data first.  The vnode must have no locks
 * or share reservations outstanding (asserted).
 */
void
vn_free(vnode_t *vp)
{
	ASSERT(vp->v_shrlocks == NULL);
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Some file systems call vn_free() with v_count of zero,
	 * some with v_count of 1.  In any case, the value should
	 * never be anything else.
	 */
	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
	ASSERT(vp->v_count_dnlc == 0);
	if (vp->v_path != NULL) {
		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
		vp->v_path = NULL;
	}

	/* If FEM was in use, make sure everything gets cleaned up */
	if (vp->v_femhead) {
		/* XXX - There should be a free_femhead() that does all this */
		ASSERT(vp->v_femhead->femh_list == NULL);
		mutex_destroy(&vp->v_femhead->femh_lock);
		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
		vp->v_femhead = NULL;
	}

	if (vp->v_fopdata != NULL) {
		free_fopdata(vp);
	}
	vp->v_mpssdata = NULL;
	vsd_free(vp);
	kmem_cache_free(vn_cache, vp);
}

/*
 * vnode status changes, should define better states than 1, 0.
 * Each routine forwards a vnode state transition to the vfs via
 * VFS_VNSTATE(), and is a no-op when the vfs has no femhead/implp.
 */
void
vn_reclaim(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
}

void
vn_idle(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
}

void
vn_exists(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
}

void
vn_invalid(vnode_t *vp)
{
	vfs_t *vfsp = vp->v_vfsp;

	if (vfsp == NULL ||
	    vfsp->vfs_implp == NULL || vfsp->vfs_femhead == NULL) {
		return;
	}
	(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
}

/*
 * Vnode event notification.  The vnevent_* routines below fire a
 * VOP_VNEVENT on the vnode; except for vnevent_support() they are
 * no-ops when the vnode has no FEM head attached (v_femhead == NULL).
 */

int
vnevent_support(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL)
		return (EINVAL);

	return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
}

void
vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
}

void
vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
}

void
vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
}

void
vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
}

void
vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
}

void
vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
}

void
vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
}

void
vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
    caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
}

void
vnevent_create(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
}

void
vnevent_link(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
}

void
vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
}

void
vnevent_truncate(vnode_t *vp, caller_context_t *ct)
{
	if (vp == NULL || vp->v_femhead == NULL) {
		return;
	}
	(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
}

/*
 * Vnode accessors.
 */

/* Return nonzero if the vnode's filesystem is mounted read-only. */
int
vn_is_readonly(vnode_t *vp)
{
	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
}

/* Return nonzero if the vnode has active file locks. */
int
vn_has_flocks(vnode_t *vp)
{
	return (vp->v_filocks != NULL);
}

/*
 * Return nonzero if the vnode has file locks and its mode makes them
 * mandatory (per the MANDLOCK() macro).
 */
int
vn_has_mandatory_locks(vnode_t *vp, int mode)
{
	return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));
}

/* Return nonzero if the vnode has pages cached in memory. */
int
vn_has_cached_data(vnode_t *vp)
{
	return (vp->v_pages != NULL);
}

/*
 * Return 0 if the vnode in question shouldn't be permitted into a zone via
 * zone_enter(2).
 */
int
vn_can_change_zones(vnode_t *vp)
{
	struct vfssw *vswp;
	int allow = 1;
	vnode_t *rvp;

	if (nfs_global_client_only != 0)
		return (1);

	/*
	 * We always want to look at the underlying vnode if there is one.
	 */
	if (VOP_REALVP(vp, &rvp, NULL) != 0)
		rvp = vp;
	/*
	 * Some pseudo filesystems (including doorfs) don't actually register
	 * their vfsops_t, so the following may return NULL; we happily let
	 * such vnodes switch zones.
	 */
	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
	if (vswp != NULL) {
		if (vswp->vsw_flag & VSW_NOTZONESAFE)
			allow = 0;
		vfs_unrefvfssw(vswp);
	}
	return (allow);
}

/*
 * Return nonzero if the vnode is a mount point, zero if not.
 */
int
vn_ismntpt(vnode_t *vp)
{
	return (vp->v_vfsmountedhere != NULL);
}

/* Retrieve the vfs (if any) mounted on this vnode */
vfs_t *
vn_mountedvfs(vnode_t *vp)
{
	return (vp->v_vfsmountedhere);
}

/*
 * Return nonzero if the vnode is referenced by the dnlc, zero if not.
 */
int
vn_in_dnlc(vnode_t *vp)
{
	return (vp->v_count_dnlc > 0);
}

/*
 * vn_has_other_opens() checks whether a particular file is opened by more than
 * just the caller and whether the open is for read and/or write.
 * This routine is for calling after the caller has already called VOP_OPEN()
 * and the caller wishes to know if they are the only one with it open for
 * the mode(s) specified.
 *
 * Vnode counts are only kept on regular files (v_type=VREG).
 */
int
vn_has_other_opens(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

	switch (mode) {
	case V_WRITE:
		if (vp->v_wrcnt > 1)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_rdcnt > 1)
			return (V_TRUE);
		break;
	}

	return (V_FALSE);
}

/*
 * vn_is_opened() checks whether a particular file is opened and
 * whether the open is for read and/or write.
 *
 * Vnode counts are only kept on regular files (v_type=VREG).
 */
int
vn_is_opened(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

	switch (mode) {
	case V_WRITE:
		if (vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_rdcnt && vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_rdcnt || vp->v_wrcnt)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_rdcnt)
			return (V_TRUE);
		break;
	}

	return (V_FALSE);
}

/*
 * vn_is_mapped() checks whether a particular file is mapped and whether
 * the file is mapped read and/or write.
 */
int
vn_is_mapped(
	vnode_t *vp,
	v_mode_t mode)
{

	ASSERT(vp != NULL);

#if !defined(_LP64)
	switch (mode) {
	/*
	 * The atomic_add_64_nv functions force atomicity in the
	 * case of 32 bit architectures. Otherwise the 64 bit values
	 * require two fetches. The value of the fields may be
	 * (potentially) changed between the first fetch and the
	 * second
	 */
	case V_WRITE:
		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0))
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) &&
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_RDORWR:
		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0)) ||
		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0)))
			return (V_TRUE);
		break;
	case V_READ:
		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0))
			return (V_TRUE);
		break;
	}
#else
	switch (mode) {
	case V_WRITE:
		if (vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDANDWR:
		if (vp->v_mmap_read && vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_RDORWR:
		if (vp->v_mmap_read || vp->v_mmap_write)
			return (V_TRUE);
		break;
	case V_READ:
		if (vp->v_mmap_read)
			return (V_TRUE);
		break;
	}
#endif

	return (V_FALSE);
}

/*
 * Set the operations vector for a vnode.
 *
 * FEM ensures that the v_femhead pointer is filled in before the
 * v_op pointer is changed.  This means that if the v_femhead pointer
 * is NULL, and the v_op field hasn't changed since before which checked
 * the v_femhead pointer; then our update is ok - we are not racing with
 * FEM.
 */
void
vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
{
	vnodeops_t *op;

	ASSERT(vp != NULL);
	ASSERT(vnodeops != NULL);

	op = vp->v_op;
	membar_consumer();
	/*
	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
	 * in effect on the vnode and we need to have FEM deal with it.
	 */
	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
	    op) {
		fem_setvnops(vp, vnodeops);
	}
}

/*
 * Retrieve the operations vector for a vnode
 * As with vn_setops(above); make sure we aren't racing with FEM.
 * FEM sets the v_op to a special, internal, vnodeops that wouldn't
 * make sense to the callers of this routine.
 */
vnodeops_t *
vn_getops(vnode_t *vp)
{
	vnodeops_t *op;

	ASSERT(vp != NULL);

	op = vp->v_op;
	membar_consumer();
	if (vp->v_femhead == NULL && op == vp->v_op) {
		/* No FEM and v_op was stable across the re-read: use it. */
		return (op);
	} else {
		return (fem_getvnops(vp));
	}
}

/*
 * Returns non-zero (1) if the vnodeops matches that of the vnode.
 * Returns zero (0) if not.
 */
int
vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
{
	return (vn_getops(vp) == vnodeops);
}

/*
 * Returns non-zero (1) if the specified operation matches the
 * corresponding operation for that the vnode.
 * Returns zero (0) if not.
 */

/* Cheap first-character check before the full strcmp. */
#define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))

int
vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
{
	const fs_operation_trans_def_t *otdp;
	fs_generic_func_p *loc = NULL;
	vnodeops_t *vop = vn_getops(vp);

	ASSERT(vopname != NULL);

	/* Find the named operation's offset within the vnodeops vector. */
	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
		if (MATCHNAME(otdp->name, vopname)) {
			loc = (fs_generic_func_p *)
			    ((char *)(vop) + otdp->offset);
			break;
		}
	}

	return ((loc != NULL) && (*loc == funcp));
}

/*
 * fs_new_caller_id() needs to return a unique ID on a given local system.
 * The IDs do not need to survive across reboots.  These are primarily
 * used so that (FEM) monitors can detect particular callers (such as
 * the NFS server) to a given vnode/vfs operation.
 */
u_longlong_t
fs_new_caller_id()
{
	static uint64_t next_caller_id = 0LL; /* First call returns 1 */

	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
}

/*
 * Given a starting vnode and a path, updates the path in the target vnode in
 * a safe manner.  If the vnode already has path information embedded, then the
 * cached path is left untouched.
 */
size_t max_vnode_path = 4 * MAXPATHLEN;

/*
 * Build vp's cached v_path by appending 'path' (length 'plen', not
 * necessarily NUL-terminated) to the cached path of 'startvp' (or
 * 'rootvp' when 'path' is absolute).  Best-effort: gives up silently
 * if the base path is missing, changes underfoot, or the result would
 * exceed max_vnode_path.
 */
void
vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
    const char *path, size_t plen)
{
	char	*rpath;
	vnode_t	*base;
	size_t	rpathlen, rpathalloc;
	int	doslash = 1;

	if (*path == '/') {
		base = rootvp;
		path++;
		plen--;
	} else {
		base = startvp;
	}

	/*
	 * We cannot grab base->v_lock while we hold vp->v_lock because of
	 * the potential for deadlock.
	 */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL) {
		mutex_exit(&base->v_lock);
		return;
	}

	rpathlen = strlen(base->v_path);
	rpathalloc = rpathlen + plen + 1;
	/* Avoid adding a slash if there's already one there */
	if (base->v_path[rpathlen-1] == '/')
		doslash = 0;
	else
		rpathalloc++;

	/*
	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
	 * so we must do this dance.  If, by chance, something changes the
	 * path, just give up since there is no real harm.
	 */
	mutex_exit(&base->v_lock);

	/* Paths should stay within reason */
	if (rpathalloc > max_vnode_path)
		return;

	rpath = kmem_alloc(rpathalloc, KM_SLEEP);

	/* Re-validate: the base path may have changed while unlocked. */
	mutex_enter(&base->v_lock);
	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
		mutex_exit(&base->v_lock);
		kmem_free(rpath, rpathalloc);
		return;
	}
	bcopy(base->v_path, rpath, rpathlen);
	mutex_exit(&base->v_lock);

	if (doslash)
		rpath[rpathlen++] = '/';
	bcopy(path, rpath + rpathlen, plen);
	rpath[rpathlen + plen] = '\0';

	/* Only install if no one else set v_path first. */
	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		mutex_exit(&vp->v_lock);
		kmem_free(rpath, rpathalloc);
	} else {
		vp->v_path = rpath;
		mutex_exit(&vp->v_lock);
	}
}

/*
 * Sets the path to the vnode to be the given string, regardless of current
 * context.  The string must be a complete path from rootdir.  This is only
 * used by fsop_root() for setting the path based on the mountpoint.
 */
void
vn_setpath_str(struct vnode *vp, const char *str, size_t len)
{
	char *buf = kmem_alloc(len + 1, KM_SLEEP);

	mutex_enter(&vp->v_lock);
	if (vp->v_path != NULL) {
		/* Someone beat us to it; keep the existing path. */
		mutex_exit(&vp->v_lock);
		kmem_free(buf, len + 1);
		return;
	}

	vp->v_path = buf;
	bcopy(str, vp->v_path, len);
	vp->v_path[len] = '\0';

	mutex_exit(&vp->v_lock);
}

/*
 * Called from within filesystem's vop_rename() to handle renames once the
 * target vnode is available.  Drops the old cached path and rebuilds it
 * from the new parent directory and name.
 */
void
vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
{
	char *tmp;

	mutex_enter(&vp->v_lock);
	tmp = vp->v_path;
	vp->v_path = NULL;
	mutex_exit(&vp->v_lock);

	vn_setpath(rootdir, dvp, vp, nm, len);
	if (tmp != NULL)
		kmem_free(tmp, strlen(tmp) + 1);
}

/*
 * Similar to vn_setpath_str(), this function sets the path of the destination
 * vnode to the be the same as the source vnode.  Best-effort; bails out if
 * either side's path changes or is already set.
 */
void
vn_copypath(struct vnode *src, struct vnode *dst)
{
	char *buf;
	int alloc;

	mutex_enter(&src->v_lock);
	if (src->v_path == NULL) {
		mutex_exit(&src->v_lock);
		return;
	}
	alloc = strlen(src->v_path) + 1;

	/* avoid kmem_alloc() with lock held */
	mutex_exit(&src->v_lock);
	buf = kmem_alloc(alloc, KM_SLEEP);
	mutex_enter(&src->v_lock);
	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
		/* Source path changed while unlocked; give up. */
		mutex_exit(&src->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	bcopy(src->v_path, buf, alloc);
	mutex_exit(&src->v_lock);

	mutex_enter(&dst->v_lock);
	if (dst->v_path != NULL) {
		mutex_exit(&dst->v_lock);
		kmem_free(buf, alloc);
		return;
	}
	dst->v_path = buf;
	mutex_exit(&dst->v_lock);
}

/*
 * XXX Private interface for segvn routines that handle vnode
 * large page segments.
 *
 * return 1 if vp's file system VOP_PAGEIO() implementation
 * can be safely used instead of VOP_GETPAGE() for handling
 * pagefaults against regular non swap files. VOP_PAGEIO()
 * interface is considered safe here if its implementation
 * is very close to VOP_GETPAGE() implementation.
 * e.g. It zero's out the part of the page beyond EOF. Doesn't
 * panic if there're file holes but instead returns an error.
 * Doesn't assume file won't be changed by user writes, etc.
 *
 * return 0 otherwise.
 *
 * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
 */
int
vn_vmpss_usepageio(vnode_t *vp)
{
	vfs_t	*vfsp = vp->v_vfsp;
	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
	char *pageio_ok_fss[] = {"ufs", "nfs", NULL};
	char **fsok = pageio_ok_fss;

	if (fsname == NULL) {
		return (0);
	}

	for (; *fsok; fsok++) {
		if (strcmp(*fsok, fsname) == 0) {
			return (1);
		}
	}
	return (0);
}

/* VOP_XXX() macros call the corresponding fop_xxx() function */

int
fop_open(
	vnode_t **vpp,
	int mode,
	cred_t *cr,
	caller_context_t *ct)
{
	int ret;
	vnode_t *vp = *vpp;

	VN_HOLD(vp);
	/*
	 * Adding to the vnode counts before calling open
	 * avoids the need for a mutex. It circumvents a race
	 * condition where a query made on the vnode counts results in a
	 * false negative. The inquirer goes away believing the file is
	 * not open when there is an open on the file already under way.
	 *
	 * The counts are meant to prevent NFS from granting a delegation
	 * when it would be dangerous to do so.
	 *
	 * The vnode counts are only kept on regular files
	 */
	if ((*vpp)->v_type == VREG) {
		if (mode & FREAD)
			atomic_inc_32(&(*vpp)->v_rdcnt);
		if (mode & FWRITE)
			atomic_inc_32(&(*vpp)->v_wrcnt);
	}

	VOPXID_MAP_CR(vp, cr);

	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);

	if (ret) {
		/*
		 * Use the saved vp just in case the vnode ptr got trashed
		 * by the error.
		 */
		VOPSTATS_UPDATE(vp, open);
		if ((vp->v_type == VREG) && (mode & FREAD))
			atomic_dec_32(&vp->v_rdcnt);
		if ((vp->v_type == VREG) && (mode & FWRITE))
			atomic_dec_32(&vp->v_wrcnt);
	} else {
		/*
		 * Some filesystems will return a different vnode,
		 * but the same path was still used to open it.
		 * So if we do change the vnode and need to
		 * copy over the path, do so here, rather than special
		 * casing each filesystem. Adjust the vnode counts to
		 * reflect the vnode switch.
		 */
		VOPSTATS_UPDATE(*vpp, open);
		if (*vpp != vp && *vpp != NULL) {
			vn_copypath(vp, *vpp);
			if (((*vpp)->v_type == VREG) && (mode & FREAD))
				atomic_inc_32(&(*vpp)->v_rdcnt);
			if ((vp->v_type == VREG) && (mode & FREAD))
				atomic_dec_32(&vp->v_rdcnt);
			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
				atomic_inc_32(&(*vpp)->v_wrcnt);
			if ((vp->v_type == VREG) && (mode & FWRITE))
				atomic_dec_32(&vp->v_wrcnt);
		}
	}
	VN_RELE(vp);
	return (ret);
}

int
fop_close(
	vnode_t *vp,
	int flag,
	int count,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_close)(vp, flag, count, offset, cr, ct);
	VOPSTATS_UPDATE(vp, close);
	/*
	 * Check passed in count to handle possible dups. Vnode counts are only
	 * kept on regular files
	 */
	if ((vp->v_type == VREG) && (count == 1)) {
		if (flag & FREAD) {
			ASSERT(vp->v_rdcnt > 0);
			atomic_dec_32(&vp->v_rdcnt);
		}
		if (flag & FWRITE) {
			ASSERT(vp->v_wrcnt > 0);
			atomic_dec_32(&vp->v_wrcnt);
		}
	}
	return (err);
}

int
fop_read(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;
	/* Snapshot resid so the byte count transferred can be recorded. */
	ssize_t	resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, read,
	    read_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_write(
	vnode_t *vp,
	uio_t *uiop,
	int ioflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;
	/* Snapshot resid so the byte count transferred can be recorded. */
	ssize_t	resid_start = uiop->uio_resid;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
	VOPSTATS_UPDATE_IO(vp, write,
	    write_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_ioctl(
	vnode_t *vp,
	int cmd,
	intptr_t arg,
	int flag,
	cred_t *cr,
	int *rvalp,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
	VOPSTATS_UPDATE(vp, ioctl);
	return (err);
}

int
fop_setfl(
	vnode_t *vp,
	int oflags,
	int nflags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_setfl)(vp,
	    oflags, nflags, cr, ct);
	VOPSTATS_UPDATE(vp, setfl);
	return (err);
}

int
fop_getattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_getattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, getattr);
	return (err);
}

int
fop_setattr(
	vnode_t *vp,
	vattr_t *vap,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * If this file system doesn't understand the xvattr extensions
	 * then turn off the xvattr bit.
	 */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR) == 0) {
		vap->va_mask &= ~AT_XVATTR;
	}

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flags & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setattr)(vp, vap, flags, cr, ct);
	VOPSTATS_UPDATE(vp, setattr);
	return (err);
}

int
fop_access(
	vnode_t *vp,
	int mode,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/* ACE-mask access checks require file system support. */
	if ((flags & V_ACE_MASK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_access)(vp, mode, flags, cr, ct);
	VOPSTATS_UPDATE(vp, access);
	return (err);
}

int
fop_lookup(
	vnode_t *dvp,
	char *nm,
	vnode_t **vpp,
	pathname_t *pnp,
	int flags,
	vnode_t *rdir,
	cred_t *cr,
	caller_context_t *ct,
	int *deflags,		/* Returned per-dirent flags */
	pathname_t *ppnp)	/* Returned case-preserved name in directory */
{
	int ret;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.  It is required
	 * that if the vfs supports case-insensitive lookup, it also
	 * supports extended dirent flags.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	if ((flags & LOOKUP_XATTR) && (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
		ret = xattr_dir_lookup(dvp, vpp, flags, cr);
	} else {
		ret = (*(dvp)->v_op->vop_lookup)
		    (dvp, nm, vpp, pnp, flags, rdir, cr, ct, deflags, ppnp);
	}
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, lookup);
		/* Populate the path cache for the freshly looked-up vnode. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
		}
	}

	return (ret);
}

int
fop_create(
	vnode_t *dvp,
	char *name,
	vattr_t *vap,
	vcexcl_t excl,
	int mode,
	vnode_t **vpp,
	cred_t *cr,
	int flags,
	caller_context_t *ct,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_create)
	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, create);
		/* Populate the path cache for the newly created vnode. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
		}
	}

	return (ret);
}

int
fop_remove(
	vnode_t *dvp,
	char *nm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_remove)(dvp, nm, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, remove);
	return (err);
}

int
fop_link(
	vnode_t *tdvp,
	vnode_t *svp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the target file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(tdvp)->v_op->vop_link)(tdvp, svp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(tdvp, link);
	return (err);
}

int
fop_rename(
	vnode_t *sdvp,
	char *snm,
	vnode_t *tdvp,
	char *tnm,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If the file system involved does not support
	 * case-insensitive access and said access is requested, fail
	 * quickly.
	 */
	if (flags & FIGNORECASE &&
	    ((vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0)))
		return (EINVAL);

	VOPXID_MAP_CR(tdvp, cr);

	err = (*(sdvp)->v_op->vop_rename)(sdvp, snm, tdvp, tnm, cr, ct, flags);
	VOPSTATS_UPDATE(sdvp, rename);
	return (err);
}

int
fop_mkdir(
	vnode_t *dvp,
	char *dirname,
	vattr_t *vap,
	vnode_t **vpp,
	cred_t *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)	/* ACL to set during create */
{
	int ret;

	if (vsecp != NULL &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE) == 0) {
		return (EINVAL);
	}
	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	ret = (*(dvp)->v_op->vop_mkdir)
	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
	if (ret == 0 && *vpp) {
		VOPSTATS_UPDATE(*vpp, mkdir);
		/* Populate the path cache for the new directory. */
		if ((*vpp)->v_path == NULL) {
			vn_setpath(rootdir, dvp, *vpp, dirname,
			    strlen(dirname));
		}
	}

	return (ret);
}

int
fop_rmdir(
	vnode_t *dvp,
	char *nm,
	vnode_t *cdir,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	err = (*(dvp)->v_op->vop_rmdir)(dvp, nm, cdir, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, rmdir);
	return (err);
}

int
fop_readdir(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	int *eofp,
	caller_context_t *ct,
	int flags)
{
	int	err;
	ssize_t	resid_start = uiop->uio_resid;

	/*
	 * If this file system doesn't support retrieving directory
	 * entry flags and said access is requested, fail quickly.
	 */
	if (flags & V_RDDIR_ENTFLAGS &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS) == 0)
		return (EINVAL);

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_readdir)(vp, uiop, cr, eofp, ct, flags);
	VOPSTATS_UPDATE_IO(vp, readdir,
	    readdir_bytes, (resid_start - uiop->uio_resid));
	return (err);
}

int
fop_symlink(
	vnode_t *dvp,
	char *linkname,
	vattr_t *vap,
	char *target,
	cred_t *cr,
	caller_context_t *ct,
	int flags)
{
	int	err;
	xvattr_t xvattr;

	/*
	 * If this file system doesn't support case-insensitive access
	 * and said access is requested, fail quickly.
	 */
	if (flags & FIGNORECASE &&
	    (vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) == 0 &&
	    vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) == 0))
		return (EINVAL);

	VOPXID_MAP_CR(dvp, cr);

	/* check for reparse point */
	if ((vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE)) &&
	    (strncmp(target, FS_REPARSE_TAG_STR,
	    strlen(FS_REPARSE_TAG_STR)) == 0)) {
		if (!fs_reparse_mark(target, vap, &xvattr))
			vap = (vattr_t *)&xvattr;
	}

	err = (*(dvp)->v_op->vop_symlink)
	    (dvp, linkname, vap, target, cr, ct, flags);
	VOPSTATS_UPDATE(dvp, symlink);
	return (err);
}

int
fop_readlink(
	vnode_t *vp,
	uio_t *uiop,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_readlink)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, readlink);
	return (err);
}

int
fop_fsync(
	vnode_t *vp,
	int syncflag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_fsync)(vp, syncflag, cr, ct);
	VOPSTATS_UPDATE(vp, fsync);
	return (err);
}

void
fop_inactive(
	vnode_t *vp,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Need to update stats before vop call since we may lose the vnode */
	VOPSTATS_UPDATE(vp, inactive);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_inactive)(vp, cr, ct);
}

int
fop_fid(
	vnode_t *vp,
	fid_t *fidp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_fid)(vp, fidp, ct);
	VOPSTATS_UPDATE(vp, fid);
	return (err);
}

int
fop_rwlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	int	ret;

	ret = ((*(vp)->v_op->vop_rwlock)(vp, write_lock, ct));
	VOPSTATS_UPDATE(vp, rwlock);
	return (ret);
}

void
fop_rwunlock(
	vnode_t *vp,
	int write_lock,
	caller_context_t *ct)
{
	(*(vp)->v_op->vop_rwunlock)(vp, write_lock, ct);
	VOPSTATS_UPDATE(vp, rwunlock);
}

int
fop_seek(
	vnode_t *vp,
	offset_t ooff,
	offset_t *noffp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_seek)(vp, ooff, noffp, ct);
	VOPSTATS_UPDATE(vp, seek);
	return (err);
}

int
fop_cmp(
	vnode_t *vp1,
	vnode_t *vp2,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp1)->v_op->vop_cmp)(vp1, vp2, ct);
	VOPSTATS_UPDATE(vp1, cmp);
	return (err);
}

int
fop_frlock(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	struct flk_callback *flk_cbp,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_frlock)
	    (vp, cmd, bfp, flag, offset, flk_cbp, cr, ct);
	VOPSTATS_UPDATE(vp, frlock);
	return (err);
}

int
fop_space(
	vnode_t *vp,
	int cmd,
	flock64_t *bfp,
	int flag,
	offset_t offset,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_space)(vp, cmd, bfp, flag, offset, cr, ct);
	VOPSTATS_UPDATE(vp, space);
	return (err);
}

int
fop_realvp(
	vnode_t *vp,
	vnode_t **vpp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_realvp)(vp, vpp, ct);
	VOPSTATS_UPDATE(vp, realvp);
	return (err);
}

int
fop_getpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	uint_t *protp,
	page_t **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getpage)
	    (vp, off, len, protp, plarr, plsz, seg, addr, rw, cr, ct);
	VOPSTATS_UPDATE(vp, getpage);
	return (err);
}

int
fop_putpage(
	vnode_t *vp,
	offset_t off,
	size_t len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_putpage)(vp, off, len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, putpage);
	return (err);
}

int
fop_map(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t *addrp,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_map)
	    (vp, off, as, addrp, len, prot, maxprot, flags, cr, ct);
	VOPSTATS_UPDATE(vp, map);
	return (err);
}

int
fop_addmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uchar_t prot,
	uchar_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_addmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/* Track mmap read/write references, regular files only. */
	if ((!error) && (vp->v_type == VREG)) {
		delta = (u_longlong_t)btopr(len);
		/*
		 * If file is declared MAP_PRIVATE, it can't be written back
		 * even if open for write. Handle as read.
		 */
		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)delta);
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value to
			 * be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)delta);
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)delta);
		}
	}
	VOPSTATS_UPDATE(vp, addmap);
	return (error);
}

int
fop_delmap(
	vnode_t *vp,
	offset_t off,
	struct as *as,
	caddr_t addr,
	size_t len,
	uint_t prot,
	uint_t maxprot,
	uint_t flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int error;
	u_longlong_t delta;

	VOPXID_MAP_CR(vp, cr);

	error = (*(vp)->v_op->vop_delmap)
	    (vp, off, as, addr, len, prot, maxprot, flags, cr, ct);

	/*
	 * NFS calls into delmap twice, the first time
	 * it simply establishes a callback mechanism and returns EAGAIN
	 * while the real work is being done upon the second invocation.
	 * We have to detect this here and only decrement the counts upon
	 * the second delmap request.
	 */
	if ((error != EAGAIN) && (vp->v_type == VREG)) {

		delta = (u_longlong_t)btopr(len);

		if (flags & MAP_PRIVATE) {
			atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
			    (int64_t)(-delta));
		} else {
			/*
			 * atomic_add_64 forces the fetch of a 64 bit value
			 * to be atomic on 32 bit machines
			 */
			if (maxprot & PROT_WRITE)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_write)),
				    (int64_t)(-delta));
			if (maxprot & PROT_READ)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
			if (maxprot & PROT_EXEC)
				atomic_add_64((uint64_t *)(&(vp->v_mmap_read)),
				    (int64_t)(-delta));
		}
	}
	VOPSTATS_UPDATE(vp, delmap);
	return (error);
}

int
fop_poll(
	vnode_t *vp,
	short events,
	int anyyet,
	short *reventsp,
	struct pollhead **phpp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_poll)(vp, events, anyyet, reventsp, phpp, ct);
	VOPSTATS_UPDATE(vp, poll);
	return (err);
}

int
fop_dump(
	vnode_t *vp,
	caddr_t addr,
	offset_t lbdn,
	offset_t dblks,
	caller_context_t *ct)
{
	int	err;

	/* ensure lbdn and dblks can be passed safely to bdev_dump */
	if ((lbdn != (daddr_t)lbdn) || (dblks != (int)dblks))
		return (EIO);

	err = (*(vp)->v_op->vop_dump)(vp, addr, lbdn, dblks, ct);
	VOPSTATS_UPDATE(vp, dump);
	return (err);
}

int
fop_pathconf(
	vnode_t *vp,
	int cmd,
	ulong_t *valp,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_pathconf)(vp, cmd, valp, cr, ct);
	VOPSTATS_UPDATE(vp, pathconf);
	return (err);
}

int
fop_pageio(
	vnode_t *vp,
	struct page *pp,
	u_offset_t io_off,
	size_t io_len,
	int flags,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_pageio)(vp, pp, io_off, io_len, flags, cr, ct);
	VOPSTATS_UPDATE(vp, pageio);
	return (err);
}

int
fop_dumpctl(
	vnode_t *vp,
	int action,
	offset_t *blkp,
	caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_dumpctl)(vp, action, blkp, ct);
	VOPSTATS_UPDATE(vp, dumpctl);
	return (err);
}

void
fop_dispose(
	vnode_t *vp,
	page_t *pp,
	int flag,
	int dn,
	cred_t *cr,
	caller_context_t *ct)
{
	/* Must do stats first since it's possible to lose the vnode */
	VOPSTATS_UPDATE(vp, dispose);

	VOPXID_MAP_CR(vp, cr);

	(*(vp)->v_op->vop_dispose)(vp, pp, flag, dn, cr, ct);
}

int
fop_setsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}
	err = (*(vp)->v_op->vop_setsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, setsecattr);
	return (err);
}

int
fop_getsecattr(
	vnode_t *vp,
	vsecattr_t *vsap,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	/*
	 * We're only allowed to skip the ACL check iff we used a 32 bit
	 * ACE mask with VOP_ACCESS() to determine permissions.
	 */
	if ((flag & ATTR_NOACLCHECK) &&
	    vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS) == 0) {
		return (EINVAL);
	}

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_getsecattr) (vp, vsap, flag, cr, ct);
	VOPSTATS_UPDATE(vp, getsecattr);
	return (err);
}

int
fop_shrlock(
	vnode_t *vp,
	int cmd,
	struct shrlock *shr,
	int flag,
	cred_t *cr,
	caller_context_t *ct)
{
	int	err;

	VOPXID_MAP_CR(vp, cr);

	err = (*(vp)->v_op->vop_shrlock)(vp, cmd, shr, flag, cr, ct);
	VOPSTATS_UPDATE(vp, shrlock);
	return (err);
}

int
fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
    caller_context_t *ct)
{
	int	err;

	err = (*(vp)->v_op->vop_vnevent)(vp, vnevent, dvp, fnm, ct);
	VOPSTATS_UPDATE(vp, vnevent);
	return (err);
}

int
fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int err;

	/* Zero-copy requires explicit file system support. */
	if (vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = (*(vp)->v_op->vop_reqzcbuf)(vp, ioflag, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, reqzcbuf);
	return (err);
}

int
fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
{
	int err;

	if (vfs_has_feature(vp->v_vfsp,
	    VFSFT_ZEROCOPY_SUPPORTED) == 0)
		return (ENOTSUP);

	err = (*(vp)->v_op->vop_retzcbuf)(vp, uiop, cr, ct);
	VOPSTATS_UPDATE(vp, retzcbuf);
	return (err);
}

/*
 * Default destructor
 *	Needed because NULL destructor means that the key is unused
 */
/* ARGSUSED */
void
vsd_defaultdestructor(void *value)
{}

/*
 * Create a key (index into per vnode array)
 *	Locks out vsd_create, vsd_destroy, and vsd_free
 *	May allocate memory with lock held
 */
void
vsd_create(uint_t *keyp, void (*destructor)(void *))
{
	int	i;
	uint_t	nkeys;

	/*
	 * if key is allocated, do nothing
	 */
	mutex_enter(&vsd_lock);
	if (*keyp) {
		mutex_exit(&vsd_lock);
		return;
	}
	/*
	 * find an unused key
	 */
	if (destructor == NULL)
		destructor = vsd_defaultdestructor;

	for (i = 0; i < vsd_nkeys; ++i)
		if (vsd_destructor[i] == NULL)
			break;

	/*
	 * if no unused keys, increase the size of the destructor array
	 */
	if (i == vsd_nkeys) {
		if ((nkeys = (vsd_nkeys << 1)) == 0)
			nkeys = 1;
		vsd_destructor =
		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
		    (size_t)(nkeys * sizeof (void (*)(void *))));
		vsd_nkeys = nkeys;
	}

	/*
	 * allocate the next available unused key
	 */
	vsd_destructor[i] = destructor;
	*keyp = i + 1;

	/* create vsd_list, if it doesn't exist */
	if (vsd_list == NULL) {
		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
		list_create(vsd_list, sizeof (struct vsd_node),
		    offsetof(struct vsd_node, vs_nodes));
	}

	mutex_exit(&vsd_lock);
}

/*
 * Destroy a key
 *
 * Assumes that the caller is preventing vsd_set and vsd_get
 * Locks out vsd_create, vsd_destroy, and vsd_free
 * May free memory with lock held
 */
void
vsd_destroy(uint_t *keyp)
{
	uint_t key;
	struct vsd_node *vsd;

	/*
	 * protect the key namespace and our destructor lists
	 */
	mutex_enter(&vsd_lock);
	key = *keyp;
	*keyp = 0;

	ASSERT(key <= vsd_nkeys);

	/*
	 * if the key is valid
	 */
	if (key != 0) {
		uint_t k = key - 1;
		/*
		 * for every vnode with VSD, call key's destructor
		 */
		for (vsd = list_head(vsd_list); vsd != NULL;
		    vsd = list_next(vsd_list, vsd)) {
			/*
			 * no VSD for key in this vnode
			 */
			if (key > vsd->vs_nkeys)
				continue;
			/*
			 * call destructor for key
			 */
			if (vsd->vs_value[k] && vsd_destructor[k])
				(*vsd_destructor[k])(vsd->vs_value[k]);
			/*
			 * reset value for key
			 */
			vsd->vs_value[k] = NULL;
		}
		/*
		 * actually free the key (NULL destructor == unused)
		 */
		vsd_destructor[k] = NULL;
	}

	mutex_exit(&vsd_lock);
}

/*
 * Quickly return the per vnode value that was stored with the specified key
 * Assumes the caller is protecting key from vsd_create and vsd_destroy
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
void *
vsd_get(vnode_t *vp, uint_t key)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	vsd = vp->v_vsd;

	if (key && vsd != NULL && key <= vsd->vs_nkeys)
		return (vsd->vs_value[key - 1]);
	return (NULL);
}

/*
 * Set a per vnode value indexed with the specified key
 * Assumes the caller is holding v_vsd_lock to protect the vsd.
 */
int
vsd_set(vnode_t *vp, uint_t key, void *value)
{
	struct vsd_node *vsd;

	ASSERT(vp != NULL);
	ASSERT(mutex_owned(&vp->v_vsd_lock));

	if (key == 0)
		return (EINVAL);

	vsd = vp->v_vsd;
	if (vsd == NULL)
		vsd = vp->v_vsd = kmem_zalloc(sizeof (*vsd), KM_SLEEP);

	/*
	 * If the vsd was just allocated, vs_nkeys will be 0, so the following
	 * code won't happen and we will continue down and allocate space for
	 * the vs_value array.
	 * If the caller is replacing one value with another, then it is up
	 * to the caller to free/rele/destroy the previous value (if needed).
	 */
	if (key <= vsd->vs_nkeys) {
		vsd->vs_value[key - 1] = value;
		return (0);
	}

	ASSERT(key <= vsd_nkeys);

	if (vsd->vs_nkeys == 0) {
		mutex_enter(&vsd_lock);	/* lock out vsd_destroy() */
		/*
		 * Link onto list of all VSD nodes.
		 */
		list_insert_head(vsd_list, vsd);
		mutex_exit(&vsd_lock);
	}

	/*
	 * Allocate vnode local storage and set the value for key
	 */
	vsd->vs_value = vsd_realloc(vsd->vs_value,
	    vsd->vs_nkeys * sizeof (void *),
	    key * sizeof (void *));
	vsd->vs_nkeys = key;
	vsd->vs_value[key - 1] = value;

	return (0);
}

/*
 * Called from vn_free() to run the destructor function for each vsd
 *	Locks out vsd_create and vsd_destroy
 *	Assumes that the destructor *DOES NOT* use vsd
 */
void
vsd_free(vnode_t *vp)
{
	int i;
	struct vsd_node *vsd = vp->v_vsd;

	if (vsd == NULL)
		return;

	if (vsd->vs_nkeys == 0) {
		kmem_free(vsd, sizeof (*vsd));
		vp->v_vsd = NULL;
		return;
	}

	/*
	 * lock out vsd_create and vsd_destroy, call
	 * the destructor, and mark the value as destroyed.
	 */
	mutex_enter(&vsd_lock);

	for (i = 0; i < vsd->vs_nkeys; i++) {
		if (vsd->vs_value[i] && vsd_destructor[i])
			(*vsd_destructor[i])(vsd->vs_value[i]);
		vsd->vs_value[i] = NULL;
	}

	/*
	 * remove from linked list of VSD nodes
	 */
	list_remove(vsd_list, vsd);

	mutex_exit(&vsd_lock);

	/*
	 * free up the VSD
	 */
	kmem_free(vsd->vs_value, vsd->vs_nkeys * sizeof (void *));
	kmem_free(vsd, sizeof (struct vsd_node));
	vp->v_vsd = NULL;
}

/*
 * realloc
 */
static void *
vsd_realloc(void *old, size_t osize, size_t nsize)
{
	void *new;

	new = kmem_zalloc(nsize, KM_SLEEP);
	if (old) {
		bcopy(old, new, osize);
		kmem_free(old, osize);
	}
	return (new);
}

/*
 * Setup the extensible system attribute for creating a reparse point.
 * The symlink data 'target' is validated for proper format of a reparse
 * string and a check also made to make sure the symlink data does not
 * point to an existing file.
 *
 * return 0 if ok else -1.
 */
static int
fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
{
	xoptattr_t *xoap;

	if ((!target) || (!vap) || (!xvattr))
		return (-1);

	/* validate reparse string */
	if (reparse_validate((const char *)target))
		return (-1);

	/* Request the XAT_REPARSE optional attribute on top of 'vap'. */
	xva_init(xvattr);
	xvattr->xva_vattr = *vap;
	xvattr->xva_vattr.va_mask |= AT_XVATTR;
	xoap = xva_getxoptattr(xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(xvattr, XAT_REPARSE);
	xoap->xoa_reparse = 1;

	return (0);
}

/*
 * Function to check whether a symlink is a reparse point.
 * Return B_TRUE if it is a reparse point, else return B_FALSE
 */
boolean_t
vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	xvattr_t xvattr;
	xoptattr_t *xoap;

	/* Only symlinks on xvattr-capable file systems can be reparse. */
	if ((vp->v_type != VLNK) ||
	    !(vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR)))
		return (B_FALSE);

	xva_init(&xvattr);
	xoap = xva_getxoptattr(&xvattr);
	ASSERT(xoap);
	XVA_SET_REQ(&xvattr, XAT_REPARSE);

	if (VOP_GETATTR(vp, &xvattr.xva_vattr, 0, cr, ct))
		return (B_FALSE);

	if ((!(xvattr.xva_vattr.va_mask & AT_XVATTR)) ||
	    (!(XVA_ISSET_RTN(&xvattr, XAT_REPARSE))))
		return (B_FALSE);

	return (xoap->xoa_reparse ? B_TRUE : B_FALSE);
}
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c	(revision 318932)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ctldir.c	(revision 318933)
@@ -1,1361 +1,1361 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. */ /* * ZFS control directory (a.k.a. ".zfs") * * This directory provides a common location for all ZFS meta-objects. * Currently, this is only the 'snapshot' directory, but this may expand in the * future. The elements are built using the GFS primitives, as the hierarchy * does not actually exist on disk. * * For 'snapshot', we don't want to have all snapshots always mounted, because * this would take up a huge amount of space in /etc/mnttab. We have three * types of objects: * * ctldir ------> snapshotdir -------> snapshot * | * | * V * mounted fs * * The 'snapshot' node contains just enough information to lookup '..' and act * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we * perform an automount of the underlying filesystem and return the * corresponding vnode. * * All mounts are handled automatically by the kernel, but unmounts are * (currently) handled from user land. The main reason is that there is no * reliable way to auto-unmount the filesystem when it's "no longer in use". * When the user unmounts a filesystem, we call zfsctl_unmount(), which * unmounts any snapshots within the snapshot directory. * * The '.zfs', '.zfs/snapshot', and all directories created under * '.zfs/snapshot' (ie: '.zfs/snapshot/') are all GFS nodes and * share the same vfs_t as the head filesystem (what '.zfs' lives under). * * File systems mounted ontop of the GFS nodes '.zfs/snapshot/' * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. 
* However, vnodes within these mounted on file systems have their v_vfsp * fields set to the head filesystem to make NFS happy (see * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t * so that it cannot be freed until all snapshots have been unmounted. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" typedef struct zfsctl_node { gfs_dir_t zc_gfs_private; uint64_t zc_id; timestruc_t zc_cmtime; /* ctime and mtime, always the same */ } zfsctl_node_t; typedef struct zfsctl_snapdir { zfsctl_node_t sd_node; kmutex_t sd_lock; avl_tree_t sd_snaps; } zfsctl_snapdir_t; typedef struct { char *se_name; vnode_t *se_root; avl_node_t se_node; } zfs_snapentry_t; static int snapentry_compare(const void *a, const void *b) { const zfs_snapentry_t *sa = a; const zfs_snapentry_t *sb = b; int ret = strcmp(sa->se_name, sb->se_name); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; vnodeops_t *zfsctl_ops_shares; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; static const fs_operation_def_t zfsctl_tops_shares[]; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; /* * Root directory elements. We only have two entries * snapshot and shares. 
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
	{ NULL }
};

/* include . and .. in the calculation */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)

/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories. This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}

void
zfsctl_fini(void)
{
	/*
	 * Remove vfsctl vnode ops
	 */
	if (zfsctl_ops_root)
		vn_freevnodeops(zfsctl_ops_root);
	if (zfsctl_ops_snapdir)
		vn_freevnodeops(zfsctl_ops_snapdir);
	if (zfsctl_ops_snapshot)
		vn_freevnodeops(zfsctl_ops_snapshot);
	if (zfsctl_ops_shares)
		vn_freevnodeops(zfsctl_ops_shares);

	zfsctl_ops_root = NULL;
	zfsctl_ops_snapdir = NULL;
	zfsctl_ops_snapshot = NULL;
	zfsctl_ops_shares = NULL;
}

/*
 * Return B_TRUE iff vp's vnodeops match one of the four .zfs control
 * directory ops vectors (root, snapdir, snapshot, or shares).
 */
boolean_t
zfsctl_is_node(vnode_t *vp)
{
	return (vn_matchops(vp, zfsctl_ops_root) ||
	    vn_matchops(vp, zfsctl_ops_snapdir) ||
	    vn_matchops(vp, zfsctl_ops_snapshot) ||
	    vn_matchops(vp, zfsctl_ops_shares));
}

/*
 * Return the inode number associated with the 'snapshot' or
 * 'shares' directory.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;

	/* index maps to zfsctl_root_entries: 0 = snapshot, 1 = shares */
	ASSERT(index < 2);

	if (index == 0)
		return (ZFSCTL_INO_SNAPDIR);

	return (zfsvfs->z_shares_dir);
}

/*
 * Create the '.zfs' directory. This directory is cached as part of the VFS
 * structure. This results in a hold on the vfs_t. The code in zfs_umount()
 * therefore checks against a vfs_count of 2 instead of 1. This reference
 * is removed when the ctldir is destroyed in the unmount.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	vnode_t *vp, *rvp;
	zfsctl_node_t *zcp;
	uint64_t crtime[2];

	ASSERT(zfsvfs->z_ctldir == NULL);

	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
	zcp = vp->v_data;
	zcp->zc_id = ZFSCTL_INO_ROOT;
	/*
	 * Stamp zc_cmtime with the head filesystem root's creation time,
	 * looked up via the root znode's SA handle.
	 */
	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
	    &crtime, sizeof (crtime)));
	ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
	VN_RELE(rvp);

	/*
	 * We're only faking the fact that we have a root of a filesystem for
	 * the sake of the GFS interfaces. Undo the flag manipulation it did
	 * for us.
	 */
	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);

	zfsvfs->z_ctldir = vp;
}

/*
 * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}

/*
 * Given a root znode, retrieve the associated .zfs directory.
 * Add a hold to the vnode and return it.
 */
vnode_t *
zfsctl_root(znode_t *zp)
{
	ASSERT(zfs_has_ctldir(zp));
	VN_HOLD(zp->z_zfsvfs->z_ctldir);
	return (zp->z_zfsvfs->z_ctldir);
}

/*
 * Common open routine.  Disallow any write access.
 */
/* ARGSUSED */
static int
zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
{
	if (flags & FWRITE)
		return (SET_ERROR(EACCES));

	return (0);
}

/*
 * Common close routine.  Nothing to do here.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr, caller_context_t *ct)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
/* ARGSUSED */
static int
zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	/* Handle both ACE-style and legacy mode-bit access requests. */
	if (flags & V_ACE_MASK) {
		if (mode & ACE_ALL_WRITE_PERMS)
			return (SET_ERROR(EACCES));
	} else {
		if (mode & VWRITE)
			return (SET_ERROR(EACCES));
	}

	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	timestruc_t	now;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	/* Read + search for everyone; writes are disallowed above. */
	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
	    S_IROTH | S_IXOTH;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
}

/*ARGSUSED*/
static int
zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	zfsctl_node_t	*zcp = vp->v_data;
	uint64_t	object = zcp->zc_id;
	zfid_short_t	*zfid;
	int		i;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len < SHORT_FID_LEN) {
		fidp->fid_len = SHORT_FID_LEN;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = SHORT_FID_LEN;

	/* Encode the object number little-endian, a byte at a time. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs znodes always have a generation number of 0 */
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*ARGSUSED*/
static int
zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
	znode_t		*dzp;
	int		error;

	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_shares_dir == 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTSUP));
	}

	/* Delegate FID generation to the real shares directory znode. */
	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
		error = VOP_FID(ZTOV(dzp), fidp, ct);
		VN_RELE(ZTOV(dzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all
files and directories * within the .zfs pseudo-filesystem. We use the following scheme: * * ENTRY ZFSCTL_INODE * .zfs 1 * .zfs/snapshot 2 * .zfs/snapshot/ objectid(snap) */ #define ZFSCTL_INO_SNAP(id) (id) /* * Get root directory attributes. */ /* ARGSUSED */ static int zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); return (0); } /* * Special case the handling of "..". */ /* ARGSUSED */ int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); if (strcmp(nm, "..") == 0) { err = VFS_ROOT(dvp->v_vfsp, vpp); } else { err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); } ZFS_EXIT(zfsvfs); return (err); } static int zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { /* * We only care about ACL_ENABLED so that libsec can * display ACL correctly and not default to POSIX draft. 
*/ if (cmd == _PC_ACL_ENABLED) { *valp = _ACL_ACE_ENABLED; return (0); } return (fs_pathconf(vp, cmd, valp, cr, ct)); } static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; /* * Gets the full dataset name that corresponds to the given snapshot name * Example: * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" */ static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; if (zfs_component_namecheck(name, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(os, zname); if (strlen(zname) + 1 + strlen(name) >= len) return (SET_ERROR(ENAMETOOLONG)); (void) strcat(zname, "@"); (void) strcat(zname, name); return (0); } static int zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) { vnode_t *svp = sep->se_root; int error; ASSERT(vn_ismntpt(svp)); /* this will be dropped by dounmount() */ if ((error = vn_vfswlock(svp)) != 0) return (error); VN_HOLD(svp); error = dounmount(vn_mountedvfs(svp), fflags, cr); if (error) { VN_RELE(svp); return (error); } /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy * the sd_lock mutex held by our caller. 
*/ ASSERT(svp->v_count == 1); gfs_vop_inactive(svp, cr, NULL); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); return (0); } static void zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) { avl_index_t where; vfs_t *vfsp; refstr_t *pathref; char newpath[MAXNAMELEN]; char *tail; ASSERT(MUTEX_HELD(&sdp->sd_lock)); ASSERT(sep != NULL); vfsp = vn_mountedvfs(sep->se_root); ASSERT(vfsp != NULL); vfs_lock_wait(vfsp); /* * Change the name in the AVL tree. */ avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); avl_insert(&sdp->sd_snaps, sep, where); /* * Change the current mountpoint info: * - update the tail of the mntpoint path * - update the tail of the resource path */ pathref = vfs_getmntpoint(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '/')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setmntpoint(vfsp, newpath, 0); pathref = vfs_getresource(vfsp); (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); VERIFY((tail = strrchr(newpath, '@')) != NULL); *(tail+1) = '\0'; ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); (void) strcat(newpath, nm); refstr_rele(pathref); vfs_setresource(vfsp, newpath, 0); vfs_unlock(vfsp); } /*ARGSUSED*/ static int zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = sdvp->v_data; zfs_snapentry_t search, *sep; zfsvfs_t *zfsvfs; avl_index_t where; char from[ZFS_MAX_DATASET_NAME_LEN], to[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN], fsname[ZFS_MAX_DATASET_NAME_LEN]; int err; zfsvfs = sdvp->v_vfsp->vfs_data; 
ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, sizeof (real), NULL); if (err == 0) { snm = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); dmu_objset_name(zfsvfs->z_os, fsname); err = zfsctl_snapshot_zname(sdvp, snm, sizeof (from), from); if (err == 0) err = zfsctl_snapshot_zname(tdvp, tnm, sizeof (to), to); if (err == 0) err = zfs_secpolicy_rename_perms(from, to, cr); if (err != 0) return (err); /* * Cannot move snapshots out of the snapdir. */ if (sdvp != tdvp) return (SET_ERROR(EINVAL)); if (strcmp(snm, tnm) == 0) return (0); mutex_enter(&sdp->sd_lock); search.se_name = (char *)snm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { mutex_exit(&sdp->sd_lock); return (SET_ERROR(ENOENT)); } err = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); if (err == 0) zfsctl_rename_snap(sdp, sep, tnm); mutex_exit(&sdp->sd_lock); return (err); } /* ARGSUSED */ static int zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, caller_context_t *ct, int flags) { zfsctl_snapdir_t *sdp = dvp->v_data; zfs_snapentry_t *sep; zfs_snapentry_t search; zfsvfs_t *zfsvfs; char snapname[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN]; int err; zfsvfs = dvp->v_vfsp->vfs_data; ZFS_ENTER(zfsvfs); if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { err = dmu_snapshot_realname(zfsvfs->z_os, name, real, sizeof (real), NULL); if (err == 0) { name = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } } ZFS_EXIT(zfsvfs); err = zfsctl_snapshot_zname(dvp, name, sizeof (snapname), snapname); if (err == 0) err = zfs_secpolicy_destroy_perms(snapname, cr); if (err != 0) return (err); mutex_enter(&sdp->sd_lock); search.se_name = name; sep = avl_find(&sdp->sd_snaps, &search, NULL); if (sep) { avl_remove(&sdp->sd_snaps, sep); err = zfsctl_unmount_snap(sep, MS_FORCE, cr); if (err != 
0) avl_add(&sdp->sd_snaps, sep); else err = dsl_destroy_snapshot(snapname, B_FALSE); } else { err = SET_ERROR(ENOENT); } mutex_exit(&sdp->sd_lock); return (err); } /* * This creates a snapshot under '.zfs/snapshot'. */ /* ARGSUSED */ static int zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; char name[ZFS_MAX_DATASET_NAME_LEN]; int err; static enum symfollow follow = NO_FOLLOW; static enum uio_seg seg = UIO_SYSSPACE; if (zfs_component_namecheck(dirname, NULL, NULL) != 0) return (SET_ERROR(EILSEQ)); dmu_objset_name(zfsvfs->z_os, name); *vpp = NULL; err = zfs_secpolicy_snapshot_perms(name, cr); if (err != 0) return (err); if (err == 0) { err = dmu_objset_snapshot_one(name, dirname); if (err != 0) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); } return (err); } /* * Lookup entry point for the 'snapshot' directory. Try to open the * snapshot if it exist, creating the pseudo filesystem vnode as necessary. * Perform a mount of the associated dataset on top of the vnode. */ /* ARGSUSED */ static int zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsctl_snapdir_t *sdp = dvp->v_data; objset_t *snap; char snapname[ZFS_MAX_DATASET_NAME_LEN]; char real[ZFS_MAX_DATASET_NAME_LEN]; char *mountpoint; zfs_snapentry_t *sep, search; struct mounta margs; vfs_t *vfsp; size_t mountpoint_len; avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; int err; /* * No extended attributes allowed under .zfs */ if (flags & LOOKUP_XATTR) return (SET_ERROR(EINVAL)); ASSERT(dvp->v_type == VDIR); /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the * spec (which looks like a local path for zfs). 
We need to * add some flag to domount() to tell it not to do this lookup. */ if (MUTEX_HELD(&sdp->sd_lock)) return (SET_ERROR(ENOENT)); ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { ZFS_EXIT(zfsvfs); return (0); } if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, sizeof (real), &conflict); if (err == 0) { nm = real; } else if (err != ENOTSUP) { ZFS_EXIT(zfsvfs); return (err); } if (realpnp) (void) strlcpy(realpnp->pn_buf, nm, realpnp->pn_bufsize); if (conflict && direntflags) *direntflags = ED_CASE_CONFLICT; } mutex_enter(&sdp->sd_lock); search.se_name = (char *)nm; if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { *vpp = sep->se_root; VN_HOLD(*vpp); err = traverse(vpp); if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } else if (*vpp == sep->se_root) { /* * The snapshot was unmounted behind our backs, * try to remount it. */ goto domount; } else { /* * VROOT was set during the traverse call. We need * to clear it since we're pretending to be part * of our parent's vfs. */ (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (err); } /* * The requested snapshot is not currently mounted, look it up. */ err = zfsctl_snapshot_zname(dvp, nm, sizeof (snapname), snapname); if (err != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* * handle "ls *" or "?" in a graceful manner, * forcing EILSEQ to ENOENT. * Since shell ultimately passes "*" or "?" as name to lookup */ return (err == EILSEQ ? 
ENOENT : err); } if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); (void) strcpy(sep->se_name, nm); *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); avl_insert(&sdp->sd_snaps, sep, where); dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", refstr_value(dvp->v_vfsp->vfs_mntpt), nm); margs.spec = snapname; margs.dir = mountpoint; margs.flags = MS_SYSSPACE | MS_NOMNTTAB; margs.fstype = "zfs"; margs.dataptr = NULL; margs.datalen = 0; margs.optptr = NULL; margs.optlen = 0; err = domount("zfs", &margs, *vpp, kcred, &vfsp); kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ and returns * the ZFS vnode mounted on top of the GFS node. This ZFS * vnode is the root of the newly created vfsp. */ VFS_RELE(vfsp); err = traverse(vpp); } if (err == 0) { /* * Fix up the root vnode mounted on .zfs/snapshot/. * * This is where we lie about our v_vfsp in order to * make .zfs/snapshot/ accessible over NFS * without requiring manual mounts of . */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; (*vpp)->v_vfsp = zfsvfs->z_vfs; (*vpp)->v_flag &= ~VROOT; } mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); /* * If we had an error, drop our hold on the vnode and * zfsctl_snapshot_inactive() will clean up. 
*/ if (err != 0) { VN_RELE(*vpp); *vpp = NULL; } return (err); } /* ARGSUSED */ static int zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, flags, rdir, cr, ct, direntflags, realpnp); VN_RELE(ZTOV(dzp)); } ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, offset_t *offp, offset_t *nextp, void *data, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; char snapname[ZFS_MAX_DATASET_NAME_LEN]; uint64_t id, cookie; boolean_t case_conflict; int error; ZFS_ENTER(zfsvfs); cookie = *offp; dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), snapname, &id, &cookie, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) { ZFS_EXIT(zfsvfs); if (error == ENOENT) { *eofp = 1; return (0); } return (error); } if (flags & V_RDDIR_ENTFLAGS) { edirent_t *eodp = dp; (void) strcpy(eodp->ed_name, snapname); eodp->ed_ino = ZFSCTL_INO_SNAP(id); eodp->ed_eflags = case_conflict ? 
ED_CASE_CONFLICT : 0; } else { struct dirent64 *odp = dp; (void) strcpy(odp->d_name, snapname); odp->d_ino = ZFSCTL_INO_SNAP(id); } *nextp = cookie; ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct, int flags) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); VN_RELE(ZTOV(dzp)); } else { *eofp = 1; error = SET_ERROR(ENOENT); } ZFS_EXIT(zfsvfs); return (error); } /* * pvp is the '.zfs' directory (zfsctl_node_t). * * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). * * This function is the callback to create a GFS vnode for '.zfs/snapshot' * when a lookup is performed on .zfs for "snapshot". */ vnode_t * zfsctl_mknode_snapdir(vnode_t *pvp) { vnode_t *vp; zfsctl_snapdir_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, zfsctl_snapdir_readdir_cb, NULL); sdp = vp->v_data; sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&sdp->sd_snaps, snapentry_compare, sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); return (vp); } vnode_t * zfsctl_mknode_shares(vnode_t *pvp) { vnode_t *vp; zfsctl_node_t *sdp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, NULL, NULL); sdp = vp->v_data; sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; return (vp); } /* ARGSUSED */ static int zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *dzp; int error; ZFS_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { 
ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); VN_RELE(ZTOV(dzp)); } ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ static int zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; zfsctl_snapdir_t *sdp = vp->v_data; ZFS_ENTER(zfsvfs); zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static void zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { zfsctl_snapdir_t *sdp = vp->v_data; void *private; private = gfs_dir_inactive(vp); if (private != NULL) { ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); mutex_destroy(&sdp->sd_lock); avl_destroy(&sdp->sd_snaps); kmem_free(private, sizeof (zfsctl_snapdir_t)); } } static const fs_operation_def_t zfsctl_tops_snapdir[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; static const fs_operation_def_t zfsctl_tops_shares[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = 
zfsctl_common_close } }, { VOPNAME_IOCTL, { .error = fs_inval } }, { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, { NULL } }; /* * pvp is the GFS vnode '.zfs/snapshot'. * * This creates a GFS node under '.zfs/snapshot' representing each * snapshot. This newly created GFS node is what we mount snapshot * vfs_t's ontop of. */ static vnode_t * zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) { vnode_t *vp; zfsctl_node_t *zcp; vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; return (vp); } static void zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; vnode_t *dvp; VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); mutex_enter(&vp->v_lock); if (vp->v_count > 1) { - vp->v_count--; + VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return; } mutex_exit(&vp->v_lock); ASSERT(!vn_ismntpt(vp)); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); if (sep->se_root == vp) { avl_remove(&sdp->sd_snaps, sep); kmem_free(sep->se_name, strlen(sep->se_name) + 1); kmem_free(sep, sizeof (zfs_snapentry_t)); break; } sep = next; } ASSERT(sep != NULL); mutex_exit(&sdp->sd_lock); VN_RELE(dvp); /* * Dispose of the vnode for the snapshot mount point. * This is safe to do because once this entry has been removed * from the AVL tree, it can't be found again, so cannot become * "active". 
If we lookup the same name again we will end up * creating a new vnode. */ gfs_vop_inactive(vp, cr, ct); } /* * These VP's should never see the light of day. They should always * be covered. */ static const fs_operation_def_t zfsctl_tops_snapshot[] = { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive }, NULL, NULL }; int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp, *vp; zfsctl_snapdir_t *sdp; zfsctl_node_t *zcp; zfs_snapentry_t *sep; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, kcred, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { vp = sep->se_root; zcp = vp->v_data; if (zcp->zc_id == objsetid) break; sep = AVL_NEXT(&sdp->sd_snaps, sep); } if (sep != NULL) { VN_HOLD(vp); /* * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ * and returns the ZFS vnode mounted on top of the GFS node. * This ZFS vnode is the root of the vfs for objset 'objsetid'. */ error = traverse(&vp); if (error == 0) { if (vp == sep->se_root) error = SET_ERROR(EINVAL); else *zfsvfsp = VTOZ(vp)->z_zfsvfs; } mutex_exit(&sdp->sd_lock); VN_RELE(vp); } else { error = SET_ERROR(EINVAL); mutex_exit(&sdp->sd_lock); } VN_RELE(dvp); return (error); } /* * Unmount any snapshots for the given filesystem. This is called from * zfs_umount() - if we have a ctldir, then go through and unmount all the * snapshots. 
*/ int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { zfsvfs_t *zfsvfs = vfsp->vfs_data; vnode_t *dvp; zfsctl_snapdir_t *sdp; zfs_snapentry_t *sep, *next; int error; ASSERT(zfsvfs->z_ctldir != NULL); error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, NULL, 0, NULL, cr, NULL, NULL, NULL); if (error != 0) return (error); sdp = dvp->v_data; mutex_enter(&sdp->sd_lock); sep = avl_first(&sdp->sd_snaps); while (sep != NULL) { next = AVL_NEXT(&sdp->sd_snaps, sep); /* * If this snapshot is not mounted, then it must * have just been unmounted by somebody else, and * will be cleaned up by zfsctl_snapdir_inactive(). */ if (vn_ismntpt(sep->se_root)) { avl_remove(&sdp->sd_snaps, sep); error = zfsctl_unmount_snap(sep, fflags, cr); if (error) { avl_add(&sdp->sd_snaps, sep); break; } } sep = next; } mutex_exit(&sdp->sd_lock); VN_RELE(dvp); return (error); } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 318932) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c (revision 318933) @@ -1,5383 +1,5383 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2015 Joyent, Inc. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fs/fs_subr.h" #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. 
* * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. 
 *
 * (5) If the operation succeeded, generate the intent log entry for it
 *     before dropping locks.  This ensures that the ordering of events
 *     in the intent log matches the order in which they actually occurred.
 *     During ZIL replay the zfs_log_* functions will update the sequence
 *     number to indicate the zil transaction has replayed.
 *
 * (6) At the end of each vnode op, the DMU tx must always commit,
 *     regardless of whether there were any errors.
 *
 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
 *     to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/*
 * VOP_OPEN entry point.  Enforces append-only semantics (opening an
 * append-only file for write without FAPPEND fails with EPERM), runs the
 * anti-virus scanner on open when enabled, and counts synchronous opens
 * in z_sync_cnt.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Virus scan eligible files on open */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * VOP_CLOSE entry point.  Cleans up this process's advisory locks and
 * shares, decrements the synchronous-open count on the last close of a
 * synchronous open, and runs the anti-virus scanner when enabled.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH means no more data/holes past *off in this direction */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

/*
 * VOP_IOCTL entry point.  Dispatches the ZFS-specific ioctls; anything
 * unrecognized returns ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	}
	return (SET_ERROR(ENOTTY));
}

/*
 * Utility functions to map and unmap a single physical page.
 These
 * are used to manage the mappable copies of ZFS file data, and therefore
 * do not update ref/mod bits.
 */

/*
 * Map a page into kernel address space for the given access mode and
 * return the kernel virtual address.  Uses the fast kpm mapping when
 * available, falling back to ppmapin().
 */
caddr_t
zfs_map_page(page_t *pp, enum seg_rw rw)
{
	if (kpm_enable)
		return (hat_kpm_mapin(pp, 0));
	ASSERT(rw == S_READ || rw == S_WRITE);
	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
	    (caddr_t)-1));
}

/*
 * Undo a zfs_map_page() mapping; "addr" must be the address it returned.
 */
void
zfs_unmap_page(page_t *pp, caddr_t addr)
{
	if (kpm_enable) {
		hat_kpm_mapout(pp, 0, addr);
	} else {
		ppmapout(addr);
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
{
	int64_t	off;

	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t nbytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			caddr_t va;

			/*
			 * Copy the just-written data back out of the DMU
			 * into the mapped page so both copies agree.
			 */
			va = zfs_map_page(pp, S_WRITE);
			(void) dmu_read(os, oid, start+off, nbytes, va+off,
			    DMU_READ_PREFETCH);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		}
		len -= nbytes;
		off = 0;
	}
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	int64_t	start, off;
	int len = nbytes;
	int error = 0;

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		page_t *pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_lookup(vp, start, SE_SHARED)) {
			/* Page is cached: copy from the mapped page */
			caddr_t va;

			va = zfs_map_page(pp, S_READ);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(pp, va);
			page_unlock(pp);
		} else {
			/* Not cached: read through the DMU */
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	vp	- vnode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	/* Zero-copy (xuio) read path: loan out arc buffers */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 * IN:	vp	- vnode of file to be written to.
 *	uio	- structure supplying write location, range info,
 *		  and data buffer.
 *	ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *		  set if in append mode.
 *	cr	- credentials of caller.
 *	ct	- caller context (NFS/CIFS fem monitor only)
 *
 * OUT:	uio	- updated offset and range.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
	 * callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			/* Zero-copy path: take the loaned buffer directly */
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Callback invoked when a dmu_sync()/zfs_get_data() operation completes:
 * releases the dbuf, range lock, and znode hold taken in zfs_get_data(),
 * and records the written block in the ZIL on success.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
/* When set, inject an I/O error into the next indirect-write sync */
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and it's checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*
 * VOP_ACCESS entry point: check the caller's access rights against the
 * znode, using ACE semantics when V_ACE_MASK is set, rwx semantics
 * otherwise.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		/* Swap the held reference for one on the specfs vnode */
		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 * IN:	dvp	- vnode of directory to search.
 *	nm	- name of entry to lookup.
 *	pnp	- full pathname to lookup [UNUSED].
 *	flags	- LOOKUP_XATTR set if looking for an attribute.
 *	rdir	- root directory vnode [UNUSED].
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	direntflags - directory lookup flags
 *	realpnp - returned pathname.
 *
 * OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/*
	 * Fast path lookup, however we must skip DNLC lookup
	 * for case folding or normalizing lookups because the
	 * DNLC code only stores the passed in name.  This means
	 * creating 'a' and removing 'A' on a case insensitive
	 * file system would work, but DNLC still thinks 'a'
	 * exists and won't let you create it again on the next
	 * pass through fast path.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			/* "" and "." both resolve to the directory itself */
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else if (!zdp->z_zfsvfs->z_norm &&
		    (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {

			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 * IN:	dvp	- vnode of directory to put new file entry in.
 *	name	- name of new file entry.
 *	vap	- attributes of new file.
 *	excl	- flag indicating exclusive or non-exclusive mode.
 *	mode	- mode to open file with.
 *	cr	- credentials of caller.
 *	flag	- large file flag [UNUSED].
 *	ct	- caller context
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created or trunc'd entry.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	*vpp = NULL;

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Remove an entry from a directory.
 *
 * IN:	dvp	- vnode of directory to remove entry from.
 *	name	- name of entry to remove.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime
 *	 vp - ctime (if nlink > 0)
 */

/* Zero value written over SA_ZPL_XATTR when an xattr dir is detached. */
uint64_t null_xattr = 0;

/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	xattr_obj_unlinked = 0;
	uint64_t	obj = 0;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	/*
	 * may_delete_now: at tx-build time we looked like the only holder;
	 * delete_now: re-verified under locks after the link was destroyed,
	 * so the znode can be freed in this tx instead of going on the
	 * unlinked set.
	 */
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	xattr_obj = 0;
	xzp = NULL;
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	mutex_enter(&vp->v_lock);
	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
	mutex_exit(&vp->v_lock);

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	obj = zp->z_id;
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	if (may_delete_now) {
		toobig =
		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	mutex_enter(&zp->z_lock);
	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
	mutex_exit(&zp->z_lock);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (xzp)
			VN_RELE(ZTOV(xzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Hold z_lock so that we can make sure that the ACL obj
		 * hasn't changed.  Could have been deleted due to
		 * zfs_sa_upgrade().
		 */
		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
		    acl_obj;
		mutex_exit(&vp->v_lock);
	}

	if (delete_now) {
		if (xattr_obj_unlinked) {
			ASSERT3U(xzp->z_links, ==, 2);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_links = 0;
			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
			    &xzp->z_links, sizeof (xzp->z_links), tx);
			ASSERT3U(error,  ==,  0);
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);

			if (zp->z_is_sa)
				error = sa_remove(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), tx);
			else
				error = sa_update(zp->z_sa_hdl,
				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
				    sizeof (uint64_t), tx);
			ASSERT0(error);
		}
		mutex_enter(&vp->v_lock);
		/*
		 * Drop our hold while v_lock is held (was an open-coded
		 * vp->v_count--); this must leave the count at zero since
		 * delete_now was only set when v_count == 1 above.
		 */
		VN_RELE_LOCKED(vp);
		ASSERT0(vp->v_count);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now)
		VN_RELE(vp);
	if (xzp)
		VN_RELE(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Create a new directory and insert it into dvp using the name
 * provided.  Return a pointer to the inserted directory.
 *
 * IN:	dvp	- vnode of directory to add subdir to.
 *	dirname	- name of new directory.
 *	vap	- attributes of new directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *	vsecp	- ACL to be set
 *
 * OUT:	vpp	- vnode of created directory.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 *	 vp - ctime|mtime|atime updated
 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Subdirectories are not allowed inside extended attribute dirs. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr((xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * acl_ids is created once, before the top: retry label; every
	 * error path from here on must free it exactly once.
	 */
	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
top:
	*vpp = NULL;

	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			/* txg rollover: wait and retry from top */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Remove a directory subdir entry.  If the current working
 * directory is the same as the subdir to be removed, the
 * remove will fail.
 *
 * IN:	dvp	- vnode of directory to remove from.
 *	name	- name of directory to be removed.
 *	cwd	- vnode of current working directory.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/* POSIX: refuse to remove the caller's current working directory. */
	if (vp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 * (Lock order: z_name_lock before z_parent_lock, released in
	 * reverse order below.)
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			/* txg rollover: wait and retry from top */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 * IN:	vp	- vnode of directory to read.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	cr	- credentials of caller.
 *	ct	- caller context
 *	flags	- case flags
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *	eofp	- set to true if end-of-file detected.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 * Offsets 0-3 are the synthetic '.', '..' and '.zfs' slots
	 * (see the block comment above); larger offsets are serialized
	 * ZAP cursors.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * For a single in-kernel iovec we fill the caller's buffer directly;
	 * otherwise we stage entries in a temporary buffer and uiomove() it.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */

			if (check_sysattrs && !zap.za_normalization_conflict) {
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				VN_RELE(ZTOV(ezp));
				goto skip_entry;
			}
			VN_RELE(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			/* NOTE: d_off is the offset for the *next* entry */
			next = &(odp->d_off);
			(void) strncpy(odp->d_name, zap.za_name,
			    DIRENT64_NAMELEN(reclen));
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		if (next)
			*next = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Stashed in thread-specific data by zfs_fsync() below; presumably
 * consumed by the ZIL write path — the consumer is not visible in
 * this file chunk (NOTE(review): confirm against zfs_fsyncer_key users).
 */
ulong_t zfs_fsync_sync_cnt = 4;

/*
 * Flush a file's dirty pages (asynchronously) and commit its ZIL
 * records so that prior writes are on stable storage.  Always
 * returns 0.
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Regardless of whether this is required for standards conformance,
	 * this is the logical behavior when fsync() is called on a file with
	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
	 * going to be pushed out as part of the zil_commit().
	 */
	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);

	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		zil_commit(zfsvfs->z_log, zp->z_id);
		ZFS_EXIT(zfsvfs);
	}
	return (0);
}

/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 * IN:	vp	- vnode of file.
 *	vap	- va_mask identifies requested attributes.
 *		  If AT_XVATTR set, then optional attrs are requested
 *	flags	- ATTR_NOACLCHECK (CIFS server context)
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * OUT:	vap	- attribute values.
 *
 * RETURN:	0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint64_t links;
	uint64_t mtime[2], ctime[2];
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch mtime and ctime from the SA layer in one bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	mutex_enter(&zp->z_lock);
	vap->va_type = vp->v_type;
	vap->va_mode = zp->z_mode & MODEMASK;
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* the synthetic '.zfs' dir adds one link to the root directory */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
	vap->va_rdev = vp->v_rdev;
	vap->va_seq = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			uint64_t times[2];

			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
			    times, sizeof (times));
			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse =
			    ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);

	mutex_exit(&zp->z_lock);

	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Set the file attributes to the values contained in the
 * vattr structure.
 *
 * IN:	vp	- vnode of file to be modified.
 *	vap	- new attribute values.
 *		  If AT_XVATTR set, then optional attrs are being set
 *	flags	- ATTR_UTIME set if non-default time values provided.
 *		- ATTR_NOACLCHECK (CIFS context only).
 *	cr	- credentials of caller.
 *	ct	- caller context
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime updated, mtime updated if size changed.
*/ /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. 
*/ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? 
*/ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (vap->va_size == 0) vnevent_truncate(ZTOV(zp), ct); } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and 
user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID))) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { if (attrzp) VN_RELE(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } } tx = dmu_tx_create(zfsvfs->z_os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? 
*/ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. 
*/ if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
*/ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime, B_TRUE); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) VN_RELE(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { 
zfs_fuid_info_free(fuidp);	/* tail of zfs_setattr(); entry point is above this chunk */
		fuidp = NULL;
	}

	if (err) {
		/* ERESTART: txg was full - retry the whole setattr from top. */
		dmu_tx_abort(tx);
		if (err == ERESTART)
			goto top;
	} else {
		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		dmu_tx_commit(tx);
	}

out2:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * One element of the lock chain built by zfs_rename_lock() while walking
 * up the directory tree; torn down by zfs_rename_unlock().
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;

/*
 * Drop locks and release vnodes that were held by zfs_rename_lock().
 * Walks the singly-linked chain, releasing the held znode (if any) and
 * the rwlock for each element, and frees the element.
 */
static void
zfs_rename_unlock(zfs_zlock_t **zlpp)
{
	zfs_zlock_t *zl;

	while ((zl = *zlpp) != NULL) {
		if (zl->zl_znode != NULL)
			VN_RELE(ZTOV(zl->zl_znode));
		rw_exit(zl->zl_rwlock);
		*zlpp = zl->zl_next;
		kmem_free(zl, sizeof (*zl));
	}
}

/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart.
				 * NOTE(review): zl here is the most recently
				 * pushed element, which equals *zlpp after
				 * any allocation, so the whole chain is
				 * released - confirm against callers.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock we now hold so it can be backed out. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}

		/* Follow ".." one level up via the SA parent attribute. */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}

/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0, rm_err = 0;
	int		zflg = 0;
	boolean_t	waited = B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
*/
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);

	/*
	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
	 * ctldir appear to have the same v_vfsp.
	 */
	if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EXDEV));
	}

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

top:
	/* Restart point after a dmu_tx_assign() ERESTART. */
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	/* Acquire the two dirent locks in the order chosen above. */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(snm, "..") == 0)
			serr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME |
				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
				    sdl->dl_name, tdzp, tdl->dl_name, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, ZTOV(szp), tnm,
				    strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
	}

	dmu_tx_commit(tx);

	if (tzp && rm_err == 0)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	if (error == 0) {
		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
		/* notify the target dir if it is not the same as source dir */
		if (tdvp != sdvp)
			vnevent_rename_dest_dir(tdvp, ct);
	}
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		link	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	boolean_t	waited = B_FALSE;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
*/
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attr or in the object. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);
	mutex_exit(&zp->z_lock);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Return, in the buffer contained in the provided uio structure,
 * the symbolic path referred to by vp.
 *
 *	IN:	vp	- vnode of symbolic link.
 *		uio	- structure to contain the link path.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- structure containing the link path.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Mirror of zfs_symlink(): target lives in an SA attr or object. */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_lookup_uio(zp->z_sa_hdl,
		    SA_ZPL_SYMLINK(zfsvfs), uio);
	else
		error = zfs_sa_readlink(zp, uio);
	mutex_exit(&zp->z_lock);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;
	boolean_t	waited = B_FALSE;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/*
	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
	 * ctldir appear to have the same v_vfsp.
	 */
	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EXDEV));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Only the owner (or privileged callers) may create the link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	/* Only block waiting for a txg after the first failed attempt. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * zfs_null_putapage() is used when the file system has been force
 * unmounted. It just drops the pages.
 */
/* ARGSUSED */
static int
zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
	return (0);
}

/*
 * Push a page out to disk, klustering if possible.
 *
 *	IN:	vp	- file to push page to.
 *		pp	- page to push.
 *		flags	- additional flags.
 *		cr	- credentials of caller.
 *
 *	OUT:	offp	- start of range pushed.
 *		lenp	- len of range pushed.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * NOTE: callers must have locked the page to be pushed.  On
 * exit, the page (and all other pages in the kluster) must be
 * unlocked.
 */
/* ARGSUSED */
static int
zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	dmu_tx_t	*tx;
	u_offset_t	off, koff;
	size_t		len, klen;
	int		err;

	off = pp->p_offset;
	len = PAGESIZE;
	/*
	 * If our blocksize is bigger than the page size, try to kluster
	 * multiple pages so that we write a full block (thus avoiding
	 * a read-modify-write).
	 */
	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
		ASSERT(koff <= zp->z_size);
		if (koff + klen > zp->z_size)
			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
	}
	ASSERT3U(btop(len), ==, btopr(len));

	/*
	 * Can't push pages past end-of-file.
	 */
	if (off >= zp->z_size) {
		/* ignore all pages */
		err = 0;
		goto out;
	} else if (off + len > zp->z_size) {
		/* Trim the kluster to the pages before EOF. */
		int npages = btopr(zp->z_size - off);
		page_t *trunc;

		page_list_break(&pp, &trunc, npages);
		/* ignore pages past end of file */
		if (trunc)
			pvn_write_done(trunc, flags);
		len = zp->z_size - off;
	}

	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		err = SET_ERROR(EDQUOT);
		goto out;
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	/* Single-page writes map the page; larger klusters go in bulk. */
	if (zp->z_blksz <= PAGESIZE) {
		caddr_t va = zfs_map_page(pp, S_READ);
		ASSERT3U(len, <=, PAGESIZE);
		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
		zfs_unmap_page(pp, va);
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
	}
	dmu_tx_commit(tx);

out:
	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
	if (offp)
		*offp = off;
	if (lenp)
		*lenp = len;

	return (err);
}

/*
 * Copy the portion of the file indicated from pages into the file.
 * The pages are stored in a page list attached to the files vnode.
 *
 *	IN:	vp	- vnode of file to push page data to.
 *		off	- position in file to put data.
 *		len	- amount of data to write.
 *		flags	- flags to control the operation.
 *		cr	- credentials of caller.
 *		ct	- caller context.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
    cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * There's nothing to do if no data is cached.
	 */
	if (!vn_has_cached_data(vp)) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggresive locking, which can
	 * impact simultanious read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
*/
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);

	/* Walk the range page by page; zfs_putapage() may advance io_len. */
	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			if (err)
				error = err;
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Vnode inactive entry point: push any cached pages, sync a dirty atime,
 * and hand the znode to zfs_zinactive()/zfs_znode_free().
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		/*
		 * NOTE(review): this revision (r318933) replaces the direct
		 * "vp->v_count = 0" with VN_RELE_LOCKED(), which drops the
		 * last hold under v_lock - confirm against vnode.h.
		 */
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.  If this fails
	 * we will get kicked out later in zfs_zinactive().
	 */
	if (vn_has_cached_data(vp)) {
		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
		    cr);
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}

/*
 * Bounds-check the seek operation.
 *
 *	IN:	vp	- vnode seeking within
 *		ooff	- old file offset
 *		noffp	- pointer to new file offset
 *		ct	- caller context
 *
 *	RETURN:	0 on success, EINVAL if new offset invalid.
 */
/* ARGSUSED */
static int
zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
    caller_context_t *ct)
{
	if (vp->v_type == VDIR)
		return (0);
	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

/*
 * Pre-filter the generic locking function to trap attempts to place
 * a mandatory lock on a memory mapped file.
 */
static int
zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * We are following the UFS semantics with respect to mapcnt
	 * here: If we see that the file is mapped already, then we will
	 * return an error, but we don't worry about races between this
	 * function and zfs_map().
	 */
	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	ZFS_EXIT(zfsvfs);
	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

/*
 * If we can't find a page in the cache, we will create a new page
 * and fill it with file data.  For efficiency, we may try to fill
 * multiple pages at once (klustering) to fill up the supplied page
 * list.
 * Note that the pages to be filled are held with an exclusive
 * lock to prevent access by other threads while they are being filled.
 */
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
{
	znode_t *zp = VTOZ(vp);
	page_t *pp, *cur_pp;
	objset_t *os = zp->z_zfsvfs->z_os;
	u_offset_t io_off, total;
	size_t io_len;
	int err;

	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
		/*
		 * We only have a single page, don't bother klustering
		 */
		io_off = off;
		io_len = PAGESIZE;
		pp = page_create_va(vp, io_off, io_len,
		    PG_EXCL | PG_WAIT, seg, addr);
	} else {
		/*
		 * Try to find enough pages to fill the page list
		 */
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, plsz, 0);
	}
	if (pp == NULL) {
		/*
		 * The page already exists, nothing to do here.
		 */
		*pl = NULL;
		return (0);
	}

	/*
	 * Fill the pages in the kluster.
	 */
	cur_pp = pp;
	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
		caddr_t va;

		ASSERT3U(io_off, ==, cur_pp->p_offset);
		/* Map the page and copy one page of file data into it. */
		va = zfs_map_page(cur_pp, S_WRITE);
		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
		    DMU_READ_PREFETCH);
		zfs_unmap_page(cur_pp, va);
		if (err) {
			/* On error, toss the entire kluster */
			pvn_read_done(pp, B_ERROR);
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}
		cur_pp = cur_pp->p_next;
	}

	/*
	 * Fill in the page list array from the kluster starting
	 * from the desired offset `off'.
	 * NOTE: the page list will always be null terminated.
	 */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	ASSERT(pl == NULL || (*pl)->p_offset == off);

	return (0);
}

/*
 * Return pointers to the pages for the file region [off, off + len]
 * in the pl array.  If plsz is greater than len, this function may
 * also return page pointers from after the specified region
 * (i.e. the region [off, off + plsz]).  These additional pages are
 * only returned if they are already in the cache, or were created as
 * part of a klustered read.
 *
 * IN:	vp	- vnode of file to get data from.
 *	off	- position in file to get data from.
 *	len	- amount of data to retrieve.
 *	plsz	- length of provided page list.
 *	seg	- segment to obtain pages for.
 *	addr	- virtual address of fault.
 *	rw	- mode of created pages.
 *	cr	- credentials of caller.
 *	ct	- caller context.
 *
 * OUT:	protp	- protection mode of created pages.
 *	pl	- list of pages created.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 */
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		**pl0 = pl;	/* start of the array, for error unwind */
	int		err = 0;

	/* we do our own caching, faultahead is unnecessary */
	if (pl == NULL)
		return (0);
	else if (len > plsz)
		len = plsz;
	else
		len = P2ROUNDUP(len, PAGESIZE);
	ASSERT(plsz >= len);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (protp)
		*protp = PROT_ALL;

	/*
	 * Loop through the requested range [off, off + len) looking
	 * for pages.  If we don't find a page, we will need to create
	 * a new page and fill it with data from the file.
	 */
	while (len > 0) {
		/*
		 * Either the page is already cached (take it SE_SHARED and
		 * terminate the list after it), or zfs_fillpage() creates
		 * and fills one or more pages starting at 'off'.
		 */
		if (*pl = page_lookup(vp, off, SE_SHARED))
			*(pl+1) = NULL;
		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
			goto out;
		/*
		 * Consume whatever run of pages the lookup/fill produced,
		 * advancing the request window and the output array.  len
		 * may hit zero before the run ends when a kluster returned
		 * more pages than were asked for; plsz keeps tracking the
		 * remaining array capacity.
		 */
		while (*pl) {
			ASSERT3U((*pl)->p_offset, ==, off);
			off += PAGESIZE;
			addr += PAGESIZE;
			if (len > 0) {
				ASSERT3U(len, >=, PAGESIZE);
				len -= PAGESIZE;
			}
			ASSERT3U(plsz, >=, PAGESIZE);
			plsz -= PAGESIZE;
			pl++;
		}
	}

	/*
	 * Fill out the page array with any pages already in the cache.
	 */
	while (plsz > 0 && (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
			off += PAGESIZE;
			plsz -= PAGESIZE;
	}
out:
	if (err) {
		/*
		 * Release any pages we have previously locked.
		 */
		while (pl > pl0)
			page_unlock(*--pl);
	} else {
		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	}

	/* The returned page list is always NULL-terminated. */
	*pl = NULL;

	ZFS_EXIT(zfsvfs);
	return (err);
}

/*
 * Request a memory map for a section of a file.
 * This code interacts
 * with common code and the VM system as follows:
 *
 * - common code calls mmap(), which ends up in smmap_common()
 * - this calls VOP_MAP(), which takes you into (say) zfs
 * - zfs_map() calls as_map(), passing segvn_create() as the callback
 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
 * - zfs_addmap() updates z_mapcnt
 */
/*ARGSUSED*/
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	segvn_crargs_t vn_a;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* No writable mappings of immutable/read-only/append-only files. */
	if ((prot & PROT_WRITE) && (zp->z_pflags &
	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Anti-virus quarantined files may not be read or executed. */
	if ((prot & (PROT_READ | PROT_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	if (vp->v_flag & VNOMAP) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSYS));
	}

	/* Reject offsets/lengths that would wrap past the maximum offset. */
	if (off < 0 || len > MAXOFFSET_T - off) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENXIO));
	}

	/* Only regular files may be mapped. */
	if (vp->v_type != VREG) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENODEV));
	}

	/*
	 * If file is locked, disallow mapping.
	 */
	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	as_rangelock(as);
	/* Pick (or validate) the user address for the new segment. */
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	/* Create the segment; segvn_create() calls back into VOP_ADDMAP(). */
	error = as_map(as, *addrp, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Account for a new mapping of the file: bump z_mapcnt by the number
 * of pages mapped.  zfs_frlock() consults this count when refusing
 * mandatory locks on mapped files.
 */
/* ARGSUSED */
static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	uint64_t pages = btopr(len);

	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
	return (0);
}

/*
 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
 * more accurate mtime for the associated file.  Since we don't have a way of
 * detecting when the data was actually modified, we have to resort to
 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
 * last page is pushed.  The problem occurs when the msync() call is omitted,
 * which is by far the most common case:
 *
 *	open()
 *	mmap()
 *	<modify memory>
 *	munmap()
 *	close()
 *