Index: vendor-sys/illumos/dist/uts/common/fs/vnode.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/vnode.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/vnode.c	(revision 354953)
@@ -1,4726 +1,4726 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2017, Joyent, Inc.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
 /*	  All Rights Reserved  	*/
 
 /*
  * University Copyright- Copyright (c) 1982, 1986, 1988
  * The Regents of the University of California
  * All Rights Reserved
  *
  * University Acknowledgment- Portions of this document are derived from
  * software developed by the University of California, Berkeley, and its
  * contributors.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/t_lock.h>
 #include <sys/errno.h>
 #include <sys/cred.h>
 #include <sys/user.h>
 #include <sys/uio.h>
 #include <sys/file.h>
 #include <sys/pathname.h>
 #include <sys/vfs.h>
 #include <sys/vfs_opreg.h>
 #include <sys/vnode.h>
 #include <sys/rwstlock.h>
 #include <sys/fem.h>
 #include <sys/stat.h>
 #include <sys/mode.h>
 #include <sys/conf.h>
 #include <sys/sysmacros.h>
 #include <sys/cmn_err.h>
 #include <sys/systm.h>
 #include <sys/kmem.h>
 #include <sys/debug.h>
 #include <c2/audit.h>
 #include <sys/acl.h>
 #include <sys/nbmlock.h>
 #include <sys/fcntl.h>
 #include <fs/fs_subr.h>
 #include <sys/taskq.h>
 #include <fs/fs_reparse.h>
 #include <sys/time.h>
 #include <sys/sdt.h>
 
 /*
  * Determine if this vnode is a file that is read-only.  Character, block
  * and FIFO vnodes never count as read-only files here; for any other type
  * the answer comes from vn_is_readonly().  vp is evaluated more than once,
  * so it must not have side effects.
  */
 #define	ISROFILE(vp)	\
 	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
 	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
 
 /* Tunable via /etc/system; used only by admin/install */
 int nfs_global_client_only;
 
 /*
  * Array of vopstats_t for per-FS-type vopstats.  This array has the same
  * number of entries as and parallel to the vfssw table.  (Arguably, it could
  * be part of the vfssw table.)  Once it's initialized, it's accessed using
  * the same fstype index that is used to index into the vfssw table.
  */
 vopstats_t **vopstats_fstype;
 
 /* vopstats initialization template used for fast initialization via bcopy() */
 static vopstats_t *vs_templatep;
 
 /* Kmem cache handle for vsk_anchor_t allocations */
 kmem_cache_t *vsk_anchor_cache;
 
 /* file events cleanup routine */
 extern void free_fopdata(vnode_t *);
 
 /*
  * Root of AVL tree for the kstats associated with vopstats.  Lock protects
  * updates to vskstat_tree.
  */
 avl_tree_t	vskstat_tree;
 kmutex_t	vskstat_tree_lock;
 
 /* Global variable which enables/disables the vopstats collection */
 int vopstats_enabled = 1;
 
 /* Global used for empty/invalid v_path */
 char *vn_vpath_empty = "";
 
 /*
  * forward declarations for internal vnode specific data (vsd)
  */
 static void *vsd_realloc(void *, size_t, size_t);
 
 /*
  * forward declarations for reparse point functions
  */
 static int fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr);
 
 /*
  * VSD -- VNODE SPECIFIC DATA
  * The v_data pointer is typically used by a file system to store a
  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
  * However, there are times when additional project private data needs
  * to be stored separately from the data (node) pointed to by v_data.
  * This additional data could be stored by the file system itself or
  * by a completely different kernel entity.  VSD provides a way for
  * callers to obtain a key and store a pointer to private data associated
  * with a vnode.
  *
  * Callers are responsible for protecting the vsd by holding v_vsd_lock
  * for calls to vsd_set() and vsd_get().
  */
 
 /*
  * vsd_lock protects:
  *   vsd_nkeys - creation and deletion of vsd keys
  *   vsd_list - insertion and deletion of vsd_node in the vsd_list
  *   vsd_destructor - adding and removing destructors to the list
  */
 static kmutex_t		vsd_lock;
 static uint_t		vsd_nkeys;	 /* size of destructor array */
 /* list of vsd_node's */
 static list_t *vsd_list = NULL;
 /* per-key destructor funcs */
 static void 		(**vsd_destructor)(void *);
 
 /*
  * The following is the common set of actions needed to update the
  * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
  * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
  * recording of the bytes transferred.  Since the code is similar
  * but small, it is nearly a duplicate.  Consequently any changes
  * to one may need to be reflected in the other.
  * Rundown of the variables:
  * vp - Pointer to the vnode
  * counter - Partial name of the structure member to update in vopstats for
  *	counts; the macro prepends "n" (e.g. "read" updates nread) and also
  *	uses it to name the fsinfo DTrace probe
  * bytecounter - Full name of the structure member to update in vopstats
  *	for bytes
  * bytesval - Value to add to the byte counter; it is expanded several
  *	times, so it must not have side effects
  * vfsp - The vnode's vfs.  Nothing is recorded unless it is non-NULL, has
  *	vfs_implp set, has VFS_STATS in vfs_flag, and the vnode is not VBAD
  * vsp - Pointer to vopstats structure (first the per-vfs vfs_vopstats,
  *	then the per-fstype vfs_fstypevsp when that is non-NULL)
  * stataddr - Address of the per-vfs counter; it is handed to the DTrace
  *	probe before the counter is incremented
  */
 
 #define	VOPSTATS_UPDATE(vp, counter) {					\
 	vfs_t *vfsp = (vp)->v_vfsp;					\
 	if (vfsp && vfsp->vfs_implp &&					\
 	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
 		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
 		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
 		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 		    size_t, uint64_t *);				\
 		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
 		(*stataddr)++;						\
 		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
 			vsp->n##counter.value.ui64++;			\
 		}							\
 	}								\
 }
 
 /*
  * I/O flavor of VOPSTATS_UPDATE(): besides bumping the n<counter> operation
  * count it adds bytesval to <bytecounter>, in the per-vfs vopstats and, when
  * vfs_fstypevsp is non-NULL, in the per-fstype vopstats as well.  bytesval
  * is also passed to the fsinfo DTrace probe.  bytesval is expanded more
  * than once, so it must be free of side effects.
  */
 #define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
 	vfs_t *vfsp = (vp)->v_vfsp;					\
 	if (vfsp && vfsp->vfs_implp &&					\
 	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
 		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
 		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
 		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
 		    size_t, uint64_t *);				\
 		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 		(*stataddr)++;						\
 		vsp->bytecounter.value.ui64 += bytesval;		\
 		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
 			vsp->n##counter.value.ui64++;			\
 			vsp->bytecounter.value.ui64 += bytesval;	\
 		}							\
 	}								\
 }
 
 /*
  * If the filesystem does not support XIDs (VFS_XID is clear in vfs_flag),
  * replace the credential with the one returned by crgetmapped().  The macro
  * assigns to cr, so the caller must pass a modifiable lvalue.
  * If the vfsp is NULL no mapping is done; perhaps we should also map?
  */
 #define	VOPXID_MAP_CR(vp, cr)	{					\
 	vfs_t *vfsp = (vp)->v_vfsp;					\
 	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
 		cr = crgetmapped(cr);					\
 	}
 
 /*
  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
  * numerical order of S_IFMT and vnode types.)
  *
  * iftovt_tab has 16 entries; slots with no corresponding vnode type hold
  * VNON.
  * NOTE(review): presumably indexed by the 4-bit file-type field of the mode
  * (S_IFMT shifted down) -- confirm against the IFTOVT() definition.
  */
 enum vtype iftovt_tab[] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 };
 
 /*
  * Reverse mapping: S_IF* format bits for a vnode type.  Entries of 0 mark
  * types that have no stat(2) format.
  * NOTE(review): presumably indexed by enum vtype in declaration order --
  * confirm against sys/vnode.h before adding or reordering vnode types.
  */
 ushort_t vttoif_tab[] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 };
 
 /*
  * The system vnode cache.
  */
 
 kmem_cache_t *vn_cache;
 
 
 /*
  * Vnode operations vector.
  *
  * One entry per vnode operation: the operation's name, the offset of its
  * slot within struct vnodeops, and two fallback routines.  The table is
  * terminated by an entry whose name and routines are NULL.
  * NOTE(review): the two routines are presumably the default used when a
  * file system supplies no handler and the one used when the operation is
  * explicitly marked as an error -- confirm against the definition of
  * fs_operation_trans_def_t.
  */
 
 static const fs_operation_trans_def_t vn_ops_table[] = {
 	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
 	    fs_setfl, fs_nosys,
 
 	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
 	    fs_rwlock, fs_rwlock,
 
 	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
 	    (fs_generic_func_p) fs_rwunlock,
 	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */
 
 	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
 	    fs_cmp, fs_cmp,		/* no errors allowed */
 
 	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
 	    fs_frlock, fs_nosys,
 
 	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
 	    (fs_generic_func_p) fs_nosys_map,
 	    (fs_generic_func_p) fs_nosys_map,
 
 	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
 	    (fs_generic_func_p) fs_nosys_addmap,
 	    (fs_generic_func_p) fs_nosys_addmap,
 
 	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
 	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,
 
 	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
 	    fs_pathconf, fs_nosys,
 
 	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
 	    (fs_generic_func_p) fs_dispose,
 	    (fs_generic_func_p) fs_nodispose,
 
 	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
 	    fs_fab_acl, fs_nosys,
 
 	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
 	    fs_shrlock, fs_nosys,
 
 	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
 	    (fs_generic_func_p) fs_vnevent_nosupport,
 	    (fs_generic_func_p) fs_vnevent_nosupport,
 
 	VOPNAME_REQZCBUF, offsetof(struct vnodeops, vop_reqzcbuf),
 	    fs_nosys, fs_nosys,
 
 	VOPNAME_RETZCBUF, offsetof(struct vnodeops, vop_retzcbuf),
 	    fs_nosys, fs_nosys,
 
 	NULL, 0, NULL, NULL
 };
 
 /* Extensible attribute (xva) routines. */
 
 /*
  * Prepare an xvattr_t for use: clear the whole structure, then record the
  * bitmap size and magic number, flag the embedded vattr_t with AT_XVATTR,
  * and aim the returned-attributes pointer at the start of the structure's
  * own returned-attributes map.
  */
 void
 xva_init(xvattr_t *xvap)
 {
 	bzero(xvap, sizeof (*xvap));
 
 	xvap->xva_vattr.va_mask = AT_XVATTR;
 	xvap->xva_magic = XVA_MAGIC;
 	xvap->xva_mapsize = XVA_MAPSIZE;
 	xvap->xva_rtnattrmapp = &xvap->xva_rtnattrmap[0];
 }
 
 /*
  * Returns a pointer to the xoptattr_t embedded in xvap when AT_XVATTR is
  * set in the embedded vattr_t's va_mask; returns NULL otherwise.
  */
 xoptattr_t *
 xva_getxoptattr(xvattr_t *xvap)
 {
 	if ((xvap->xva_vattr.va_mask & AT_XVATTR) == 0)
 		return (NULL);
 
 	return (&xvap->xva_xoptattrs);
 }
 
 /*
  * AVL comparator for vsk_anchor_t nodes.  Anchors are ordered by vsk_fsid,
  * the f_fsid reported by VFS_STATVFS(), since that value is also what the
  * kstat name is built from.  Returns -1, 0 or 1.
  */
 static int
 vska_compar(const void *n1, const void *n2)
 {
 	ulong_t lhs = ((vsk_anchor_t *)n1)->vsk_fsid;
 	ulong_t rhs = ((vsk_anchor_t *)n2)->vsk_fsid;
 
 	if (lhs < rhs)
 		return (-1);
 	if (lhs > rhs)
 		return (1);
 	return (0);
 }
 
 /*
  * Build the single vopstats template.  Every named kstat in a vopstats_t
  * is initialized once here as a 64-bit counter whose kstat name matches the
  * member name; initialize_vopstats() later bcopy()s the result into each
  * vopstats_t that needs a fresh set of counters.
  */
 #define	VS_TEMPLATE_INIT(member)	\
 	kstat_named_init(&tmpl->member, #member, KSTAT_DATA_UINT64)
 
 static vopstats_t *
 create_vopstats_template()
 {
 	vopstats_t		*tmpl;
 
 	tmpl = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
 	bzero(tmpl, sizeof (*tmpl));	/* every counter starts at zero */
 
 	VS_TEMPLATE_INIT(nopen);		/* VOP_OPEN */
 	VS_TEMPLATE_INIT(nclose);		/* VOP_CLOSE */
 	VS_TEMPLATE_INIT(nread);		/* VOP_READ I/O */
 	VS_TEMPLATE_INIT(read_bytes);
 	VS_TEMPLATE_INIT(nwrite);		/* VOP_WRITE I/O */
 	VS_TEMPLATE_INIT(write_bytes);
 	VS_TEMPLATE_INIT(nioctl);		/* VOP_IOCTL */
 	VS_TEMPLATE_INIT(nsetfl);		/* VOP_SETFL */
 	VS_TEMPLATE_INIT(ngetattr);		/* VOP_GETATTR */
 	VS_TEMPLATE_INIT(nsetattr);		/* VOP_SETATTR */
 	VS_TEMPLATE_INIT(naccess);		/* VOP_ACCESS */
 	VS_TEMPLATE_INIT(nlookup);		/* VOP_LOOKUP */
 	VS_TEMPLATE_INIT(ncreate);		/* VOP_CREATE */
 	VS_TEMPLATE_INIT(nremove);		/* VOP_REMOVE */
 	VS_TEMPLATE_INIT(nlink);		/* VOP_LINK */
 	VS_TEMPLATE_INIT(nrename);		/* VOP_RENAME */
 	VS_TEMPLATE_INIT(nmkdir);		/* VOP_MKDIR */
 	VS_TEMPLATE_INIT(nrmdir);		/* VOP_RMDIR */
 	VS_TEMPLATE_INIT(nreaddir);		/* VOP_READDIR I/O */
 	VS_TEMPLATE_INIT(readdir_bytes);
 	VS_TEMPLATE_INIT(nsymlink);		/* VOP_SYMLINK */
 	VS_TEMPLATE_INIT(nreadlink);		/* VOP_READLINK */
 	VS_TEMPLATE_INIT(nfsync);		/* VOP_FSYNC */
 	VS_TEMPLATE_INIT(ninactive);		/* VOP_INACTIVE */
 	VS_TEMPLATE_INIT(nfid);			/* VOP_FID */
 	VS_TEMPLATE_INIT(nrwlock);		/* VOP_RWLOCK */
 	VS_TEMPLATE_INIT(nrwunlock);		/* VOP_RWUNLOCK */
 	VS_TEMPLATE_INIT(nseek);		/* VOP_SEEK */
 	VS_TEMPLATE_INIT(ncmp);			/* VOP_CMP */
 	VS_TEMPLATE_INIT(nfrlock);		/* VOP_FRLOCK */
 	VS_TEMPLATE_INIT(nspace);		/* VOP_SPACE */
 	VS_TEMPLATE_INIT(nrealvp);		/* VOP_REALVP */
 	VS_TEMPLATE_INIT(ngetpage);		/* VOP_GETPAGE */
 	VS_TEMPLATE_INIT(nputpage);		/* VOP_PUTPAGE */
 	VS_TEMPLATE_INIT(nmap);			/* VOP_MAP */
 	VS_TEMPLATE_INIT(naddmap);		/* VOP_ADDMAP */
 	VS_TEMPLATE_INIT(ndelmap);		/* VOP_DELMAP */
 	VS_TEMPLATE_INIT(npoll);		/* VOP_POLL */
 	VS_TEMPLATE_INIT(ndump);		/* VOP_DUMP */
 	VS_TEMPLATE_INIT(npathconf);		/* VOP_PATHCONF */
 	VS_TEMPLATE_INIT(npageio);		/* VOP_PAGEIO */
 	VS_TEMPLATE_INIT(ndumpctl);		/* VOP_DUMPCTL */
 	VS_TEMPLATE_INIT(ndispose);		/* VOP_DISPOSE */
 	VS_TEMPLATE_INIT(nsetsecattr);		/* VOP_SETSECATTR */
 	VS_TEMPLATE_INIT(ngetsecattr);		/* VOP_GETSECATTR */
 	VS_TEMPLATE_INIT(nshrlock);		/* VOP_SHRLOCK */
 	VS_TEMPLATE_INIT(nvnevent);		/* VOP_VNEVENT */
 	VS_TEMPLATE_INIT(nreqzcbuf);		/* VOP_REQZCBUF */
 	VS_TEMPLATE_INIT(nretzcbuf);		/* VOP_RETZCBUF */
 
 	return (tmpl);
 }
 
 #undef	VS_TEMPLATE_INIT
 
 /*
  * Creates and installs a named, virtual, writable kstat called ksname in
  * module "unix", class "misc", whose data is the caller's vopstats
  * structure.  Returns the kstat, or NULL when vopstats collection is
  * disabled or kstat_create() fails.
  */
 kstat_t *
 new_vskstat(char *ksname, vopstats_t *vsp)
 {
 	kstat_t		*kstatp = NULL;
 
 	if (vopstats_enabled) {
 		kstatp = kstat_create("unix", 0, ksname, "misc",
 		    KSTAT_TYPE_NAMED,
 		    sizeof (vopstats_t) / sizeof (kstat_named_t),
 		    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
 		if (kstatp != NULL) {
 			/* Virtual kstat: the data lives in the caller's vsp */
 			kstatp->ks_data = vsp;
 			kstat_install(kstatp);
 		}
 	}
 
 	return (kstatp);
 }
 
 /*
  * Called from vfsinit() to initialize the support mechanisms for vopstats.
  * Nothing is set up when vopstats_enabled is clear.
  */
 void
 vopstats_startup()
 {
 	size_t	tblsize;
 
 	if (!vopstats_enabled)
 		return;
 
 	/*
 	 * The AVL tree holds the per-vfs vopstat anchors; it lets us check
 	 * whether a kstat already exists before we attempt to create it.
 	 * The mutex protects updates to the tree.
 	 */
 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
 	    offsetof(vsk_anchor_t, vsk_node));
 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
 	    NULL, NULL, 0);
 
 	/*
 	 * One (initially NULL) vopstats pointer per file system type.  The
 	 * entries will be allocated/initialized as each file system goes
 	 * through modload/mod_installfs.
 	 */
 	tblsize = sizeof (vopstats_t *) * nfstype;
 	vopstats_fstype = (vopstats_t **)kmem_zalloc(tblsize, KM_SLEEP);
 
 	/* The global template that initialize_vopstats() copies from */
 	vs_templatep = create_vopstats_template();
 }
 
 /*
  * We need to have all of the counters zeroed.
  * The initialization of the vopstats_t includes on the order of
  * 50 calls to kstat_named_init().  Rather than do that on every call,
  * we do it once in a template (vs_templatep) then bcopy it over.
  *
  * A NULL vsp is ignored.  The template only exists if vopstats_startup()
  * ran with vopstats_enabled set; when it is missing we cannot copy from
  * it, so we simply zero the structure.  That still satisfies the "all
  * counters zeroed" requirement and avoids a bcopy() from a NULL source.
  */
 void
 initialize_vopstats(vopstats_t *vsp)
 {
 	if (vsp == NULL)
 		return;
 
 	if (vs_templatep == NULL) {
 		/* No template was ever built; fall back to plain zeroes. */
 		bzero(vsp, sizeof (vopstats_t));
 		return;
 	}
 
 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
 }
 
 /*
  * If possible, determine which per-fstype vopstats to use and return a
  * pointer to it.  Returns NULL when vfsp is NULL, when the vfs does not
  * have VFS_STATS set, when vopstats collection is disabled, or when the
  * file system type index is out of range.
  */
 vopstats_t *
 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
 {
 	int		idx;		/* Index into vfssw[] */
 
 	if (vfsp == NULL)
 		return (NULL);
 	if ((vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 		return (NULL);
 
 	/*
 	 * Prefer the caller's vfssw entry over the vfs's own fstype.  We go
 	 * to so much trouble because all versions of NFS use the same fstype
 	 * in their vfs even though they have distinct entries in the vfssw[]
 	 * table.
 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
 	 */
 	if (vswp != NULL)
 		idx = vswp - vfssw;	/* Position within vfssw[] */
 	else
 		idx = vfsp->vfs_fstype;
 
 	/*
 	 * The only valid values are non-zero positive values less than the
 	 * number of vfssw[] table entries.
 	 */
 	if (idx <= 0 || idx >= nfstype)
 		return (NULL);
 
 	return (vopstats_fstype[idx]);
 }
 
 /*
  * Generate a kstat name from the file system's f_fsid, allocate a
  * vsk_anchor_t keyed by that fsid, and create the kstat that the anchor
  * holds.  Returns the anchor, or NULL when stats do not apply to this vfs,
  * VFS_STATVFS() fails, or an anchor with the same fsid is already in the
  * tree.  The anchor is returned even if the kstat could not be created, in
  * which case its vsk_ksp is NULL.  This must only be called from a mount.
  */
 vsk_anchor_t *
 get_vskstat_anchor(vfs_t *vfsp)
 {
 	char		ksname[KSTAT_STRLEN];	/* kstat name for vopstats */
 	statvfs64_t	sbuf;			/* Needed to find f_fsid */
 	vsk_anchor_t	*anchor;		/* vfs <--> kstat anchor */
 	avl_index_t	pos;			/* Location in the AVL tree */
 
 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 		return (NULL);
 
 	/* Without the fsid there is nothing to name the kstat after */
 	if (VFS_STATVFS(vfsp, &sbuf) != 0)
 		return (NULL);
 
 	(void) snprintf(ksname, KSTAT_STRLEN, "%s%lx",
 	    VOPSTATS_STR, sbuf.f_fsid);
 
 	anchor = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
 	bzero(anchor, sizeof (*anchor));
 	anchor->vsk_fsid = sbuf.f_fsid;
 
 	mutex_enter(&vskstat_tree_lock);
 	if (avl_find(&vskstat_tree, anchor, &pos) != NULL) {
 		/* This fsid already has an anchor; discard ours. */
 		mutex_exit(&vskstat_tree_lock);
 		kmem_cache_free(vsk_anchor_cache, anchor);
 		return (NULL);
 	}
 	avl_insert(&vskstat_tree, anchor, pos);
 	mutex_exit(&vskstat_tree_lock);
 
 	/*
 	 * The anchor is in the tree, so the kstat can be created now.  The
 	 * anchor was zeroed above, so vsk_ksp is simply NULL on failure.
 	 */
 	anchor->vsk_ksp = new_vskstat(ksname, &vfsp->vfs_vopstats);
 
 	return (anchor);
 }
 
 /*
  * We're in the process of tearing down the vfs and need to clean up the
  * data structures associated with the vopstats: the anchor is detached
  * from the vfs, removed from the AVL tree, its kstat (if any) is deleted,
  * and the anchor is returned to its cache.  Must only be called from
  * dounmount().
  */
 void
 teardown_vopstats(vfs_t *vfsp)
 {
 	vsk_anchor_t	*anchor;
 	avl_index_t	pos;
 
 	if (vfsp == NULL || vfsp->vfs_implp == NULL)
 		return;
 	if ((vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
 		return;
 
 	/* Safe to look at since VFS_STATS is known to be set (see above) */
 	anchor = vfsp->vfs_vskap;
 	if (anchor == NULL)
 		return;
 
 	/* Detach the anchor from the vfs before doing anything else */
 	vfsp->vfs_vskap = NULL;
 
 	/* Under the tree lock: unlink the node and delete the kstat */
 	mutex_enter(&vskstat_tree_lock);
 	if (avl_find(&vskstat_tree, anchor, &pos) != NULL)
 		avl_remove(&vskstat_tree, anchor);
 	if (anchor->vsk_ksp != NULL)
 		kstat_delete(anchor->vsk_ksp);
 	mutex_exit(&vskstat_tree_lock);
 
 	kmem_cache_free(vsk_anchor_cache, anchor);
 }
 
 /*
  * Read or write a vnode.  Called from kernel code.
  *
  * Transfers up to len bytes between base and vp, starting at offset,
  * through a single-iovec uio.  Returns 0 or an errno value: EROFS for a
  * write to a read-only file, EIO for a negative len, EACCES on a
  * non-blocking mandatory lock conflict, otherwise whatever nbl_svmand(),
  * VOP_READ() or VOP_WRITE() returned.  Once the I/O has been attempted the
  * residual count is stored through residp when that is non-NULL; with a
  * NULL residp a non-zero residual is reported as EIO.
  */
 int
 vn_rdwr(
 	enum uio_rw rw,
 	struct vnode *vp,
 	caddr_t base,
 	ssize_t len,
 	offset_t offset,
 	enum uio_seg seg,
 	int ioflag,
 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
 	cred_t *cr,
 	ssize_t *residp)
 {
 	struct uio auio;
 	struct iovec aiov;
 	int writing = (rw == UIO_WRITE);
 	int crit_entered = 0;
 	int error;
 
 	if (writing && ISROFILE(vp))
 		return (EROFS);
 
 	if (len < 0)
 		return (EIO);
 
 	VOPXID_MAP_CR(vp, cr);
 
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_loffset = offset;
 	auio.uio_segflg = (short)seg;
 	auio.uio_resid = len;
 	auio.uio_llimit = ulimit;
 
 	/*
 	 * The critical region must be entered before VOP_RWLOCK is called,
 	 * otherwise we can deadlock with ufs.
 	 */
 	if (nbl_need_check(vp)) {
 		int svmand;
 
 		nbl_start_crit(vp, RW_READER);
 		crit_entered = 1;
 		error = nbl_svmand(vp, cr, &svmand);
 		if (error != 0)
 			goto done;
 		if (nbl_conflict(vp, writing ? NBL_WRITE : NBL_READ,
 		    auio.uio_offset, auio.uio_resid, svmand, NULL)) {
 			error = EACCES;
 			goto done;
 		}
 	}
 
 	if (writing) {
 		(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
 		auio.uio_fmode = FWRITE;
 		auio.uio_extflg = UIO_COPY_DEFAULT;
 		error = VOP_WRITE(vp, &auio, ioflag, cr, NULL);
 		VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
 	} else {
 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
 		auio.uio_fmode = FREAD;
 		auio.uio_extflg = UIO_COPY_CACHED;
 		error = VOP_READ(vp, &auio, ioflag, cr, NULL);
 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 	}
 
 	if (residp != NULL) {
 		*residp = auio.uio_resid;
 	} else if (auio.uio_resid != 0) {
 		/* Nowhere to report a short transfer, so make it an error */
 		error = EIO;
 	}
 
 done:
 	if (crit_entered)
 		nbl_end_crit(vp);
 	return (error);
 }
 
 /*
  * Release a vnode: drop one reference, or call VOP_INACTIVE if this is
  * the last one.
  *
  * On the last reference v_count is deliberately left at 1 across the
  * VOP_INACTIVE call.  That keeps another thread from reclaiming and
  * releasing the vnode *before* the inactive routine has had a chance to
  * destroy it; only one thread may be in VOP_INACTIVE for a given vnode.
  */
 void
 vn_rele(vnode_t *vp)
 {
 	VERIFY(vp->v_count > 0);
 	mutex_enter(&vp->v_lock);
 	if (vp->v_count != 1) {
 		/* Other holds remain; just give ours back. */
 		VN_RELE_LOCKED(vp);
 		mutex_exit(&vp->v_lock);
 		return;
 	}
 	mutex_exit(&vp->v_lock);
 	VOP_INACTIVE(vp, CRED(), NULL);
 }
 
 /*
  * Release a vnode referenced by the DNLC.  All DNLC holds together count
  * as one reference, so v_count is only touched when the last DNLC hold
  * (tracked in v_count_dnlc) goes away.  This makes it possible to
  * distinguish vnodes that are referenced only by the DNLC.
  */
 void
 vn_rele_dnlc(vnode_t *vp)
 {
 	VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
 	mutex_enter(&vp->v_lock);
 	vp->v_count_dnlc--;
 	if (vp->v_count_dnlc != 0) {
 		/* The DNLC still holds this vnode; v_count is unchanged. */
 		mutex_exit(&vp->v_lock);
 		return;
 	}
 	if (vp->v_count != 1) {
 		VN_RELE_LOCKED(vp);
 		mutex_exit(&vp->v_lock);
 		return;
 	}
 	/* Last reference overall: v_count stays at 1 for VOP_INACTIVE */
 	mutex_exit(&vp->v_lock);
 	VOP_INACTIVE(vp, CRED(), NULL);
 }
 
 /*
  * Same as vn_rele(), but v_stream is cleared while v_lock is held.
  * sockfs uses this when it dismantles the association between the sockfs
  * node and the vnode in the underlying file system.  Holding v_lock keeps
  * a thread coming through the lookupname path from accessing a stream head
  * that is going away.
  */
 void
 vn_rele_stream(vnode_t *vp)
 {
 	VERIFY(vp->v_count > 0);
 	mutex_enter(&vp->v_lock);
 	vp->v_stream = NULL;
 	if (vp->v_count != 1) {
 		VN_RELE_LOCKED(vp);
 		mutex_exit(&vp->v_lock);
 		return;
 	}
 	mutex_exit(&vp->v_lock);
 	VOP_INACTIVE(vp, CRED(), NULL);
 }
 
 /*
  * Taskq callback used by vn_rele_async(): performs the deferred
  * VOP_INACTIVE() call on the vnode passed as the task argument.
  */
 static void
 vn_rele_inactive(vnode_t *vp)
 {
 	VOP_INACTIVE(vp, CRED(), NULL);
 }
 
 /*
  * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
  * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
  * the file system as a result of releasing the vnode. Note, file systems
  * already have to handle the race where the vnode is incremented before the
  * inactive routine is called and does its locking.
  *
  * The dispatch is made with TQ_SLEEP and wrapped in VERIFY(), so a failed
  * dispatch is treated as fatal rather than reported to the caller.
  *
  * Warning: Excessive use of this routine can lead to performance problems.
  * This is because taskqs throttle back allocation if too many are created.
  */
 void
 vn_rele_async(vnode_t *vp, taskq_t *taskq)
 {
 	VERIFY(vp->v_count > 0);
 	mutex_enter(&vp->v_lock);
 	if (vp->v_count == 1) {
 		/* Last hold: v_count stays at 1 while the task is pending */
 		mutex_exit(&vp->v_lock);
 		VERIFY(taskq_dispatch(taskq, (task_func_t *)vn_rele_inactive,
-		    vp, TQ_SLEEP) != NULL);
+		    vp, TQ_SLEEP) != TASKQID_INVALID);
 		return;
 	}
 	VN_RELE_LOCKED(vp);
 	mutex_exit(&vp->v_lock);
 }
 
 /*
  * Open/create a vnode by path.  Convenience form of vn_openat() that
  * supplies no starting vnode (NULL) and no file descriptor (-1); all other
  * arguments are passed through unchanged and the result of vn_openat() is
  * returned.
  */
 int
 vn_open(
 	char *pnamep,
 	enum uio_seg seg,
 	int filemode,
 	int createmode,
 	struct vnode **vpp,
 	enum create crwhy,
 	mode_t umask)
 {
 	int error;
 
 	error = vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
 	    umask, NULL, -1);
 
 	return (error);
 }
 
 
 /*
  * Open/create a vnode.
  * This may be callable by the kernel, the only known use
  * of user context being that the current user credentials
  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
  */
 int
 vn_openat(
 	char *pnamep,
 	enum uio_seg seg,
 	int filemode,
 	int createmode,
 	struct vnode **vpp,
 	enum create crwhy,
 	mode_t umask,
 	struct vnode *startvp,
 	int fd)
 {
 	struct vnode *vp;
 	int mode;
 	int accessflags;
 	int error;
 	int in_crit = 0;
 	int open_done = 0;
 	int shrlock_done = 0;
 	struct vattr vattr;
 	enum symfollow follow;
 	int estale_retry = 0;
 	struct shrlock shr;
 	struct shr_locowner shr_own;
 
 	mode = 0;
 	accessflags = 0;
 	if (filemode & FREAD)
 		mode |= VREAD;
 	if (filemode & (FWRITE|FTRUNC))
 		mode |= VWRITE;
 	if (filemode & (FSEARCH|FEXEC|FXATTRDIROPEN))
 		mode |= VEXEC;
 
 	/* symlink interpretation */
 	if (filemode & FNOFOLLOW)
 		follow = NO_FOLLOW;
 	else
 		follow = FOLLOW;
 
 	if (filemode & FAPPEND)
 		accessflags |= V_APPEND;
 
 top:
 	if (filemode & FCREAT) {
 		enum vcexcl excl;
 
 		/*
 		 * Wish to create a file.
 		 */
 		vattr.va_type = VREG;
 		vattr.va_mode = createmode;
 		vattr.va_mask = AT_TYPE|AT_MODE;
 		if (filemode & FTRUNC) {
 			vattr.va_size = 0;
 			vattr.va_mask |= AT_SIZE;
 		}
 		if (filemode & FEXCL)
 			excl = EXCL;
 		else
 			excl = NONEXCL;
 
 		if (error =
 		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
 		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
 			return (error);
 	} else {
 		/*
 		 * Wish to open a file.  Just look it up.
 		 */
 		if (error = lookupnameat(pnamep, seg, follow,
 		    NULLVPP, &vp, startvp)) {
 			if ((error == ESTALE) &&
 			    fs_need_estale_retry(estale_retry++))
 				goto top;
 			return (error);
 		}
 
 		/*
 		 * Get the attributes to check whether file is large.
 		 * We do this only if the FOFFMAX flag is not set and
 		 * only for regular files.
 		 */
 
 		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
 			vattr.va_mask = AT_SIZE;
 			if ((error = VOP_GETATTR(vp, &vattr, 0,
 			    CRED(), NULL))) {
 				goto out;
 			}
 			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
 				/*
 				 * Large File API - regular open fails
 				 * if FOFFMAX flag is set in file mode
 				 */
 				error = EOVERFLOW;
 				goto out;
 			}
 		}
 		/*
 		 * Can't write directories, active texts, or
 		 * read-only filesystems.  Can't truncate files
 		 * on which mandatory locking is in effect.
 		 */
 		if (filemode & (FWRITE|FTRUNC)) {
 			/*
 			 * Allow writable directory if VDIROPEN flag is set.
 			 */
 			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
 				error = EISDIR;
 				goto out;
 			}
 			if (ISROFILE(vp)) {
 				error = EROFS;
 				goto out;
 			}
 			/*
 			 * Can't truncate files on which
 			 * sysv mandatory locking is in effect.
 			 */
 			if (filemode & FTRUNC) {
 				vnode_t *rvp;
 
 				if (VOP_REALVP(vp, &rvp, NULL) != 0)
 					rvp = vp;
 				if (rvp->v_filocks != NULL) {
 					vattr.va_mask = AT_MODE;
 					if ((error = VOP_GETATTR(vp,
 					    &vattr, 0, CRED(), NULL)) == 0 &&
 					    MANDLOCK(vp, vattr.va_mode))
 						error = EAGAIN;
 				}
 			}
 			if (error)
 				goto out;
 		}
 		/*
 		 * Check permissions.
 		 */
 		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
 			goto out;
 		/*
 		 * Require FSEARCH to return a directory.
 		 * Require FEXEC to return a regular file.
 		 */
 		if ((filemode & FSEARCH) && vp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		}
 		if ((filemode & FEXEC) && vp->v_type != VREG) {
 			error = ENOEXEC;	/* XXX: error code? */
 			goto out;
 		}
 	}
 
 	/*
 	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
 	 */
 	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
 		error = ELOOP;
 		goto out;
 	}
 	if (filemode & FNOLINKS) {
 		vattr.va_mask = AT_NLINK;
 		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
 			goto out;
 		}
 		if (vattr.va_nlink != 1) {
 			error = EMLINK;
 			goto out;
 		}
 	}
 
 	/*
 	 * Opening a socket corresponding to the AF_UNIX pathname
 	 * in the filesystem name space is not supported.
 	 * However, VSOCK nodes in namefs are supported in order
 	 * to make fattach work for sockets.
 	 *
 	 * XXX This uses VOP_REALVP to distinguish between
 	 * an unopened namefs node (where VOP_REALVP returns a
 	 * different VSOCK vnode) and a VSOCK created by vn_create
 	 * in some file system (where VOP_REALVP would never return
 	 * a different vnode).
 	 */
 	if (vp->v_type == VSOCK) {
 		struct vnode *nvp;
 
 		error = VOP_REALVP(vp, &nvp, NULL);
 		if (error != 0 || nvp == NULL || nvp == vp ||
 		    nvp->v_type != VSOCK) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 
 	if ((vp->v_type == VREG) && nbl_need_check(vp)) {
 		/* get share reservation */
 		shr.s_access = 0;
 		if (filemode & FWRITE)
 			shr.s_access |= F_WRACC;
 		if (filemode & FREAD)
 			shr.s_access |= F_RDACC;
 		shr.s_deny = 0;
 		shr.s_sysid = 0;
 		shr.s_pid = ttoproc(curthread)->p_pid;
 		shr_own.sl_pid = shr.s_pid;
 		shr_own.sl_id = fd;
 		shr.s_own_len = sizeof (shr_own);
 		shr.s_owner = (caddr_t)&shr_own;
 		error = VOP_SHRLOCK(vp, F_SHARE_NBMAND, &shr, filemode, CRED(),
 		    NULL);
 		if (error)
 			goto out;
 		shrlock_done = 1;
 
 		/* nbmand conflict check if truncating file */
 		if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
 			nbl_start_crit(vp, RW_READER);
 			in_crit = 1;
 
 			vattr.va_mask = AT_SIZE;
 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
 				goto out;
 			if (nbl_conflict(vp, NBL_WRITE, 0, vattr.va_size, 0,
 			    NULL)) {
 				error = EACCES;
 				goto out;
 			}
 		}
 	}
 
 	/*
 	 * Do opening protocol.
 	 */
 	error = VOP_OPEN(&vp, filemode, CRED(), NULL);
 	if (error)
 		goto out;
 	open_done = 1;
 
 	/*
 	 * Truncate if required.
 	 */
 	if ((filemode & FTRUNC) && !(filemode & FCREAT)) {
 		vattr.va_size = 0;
 		vattr.va_mask = AT_SIZE;
 		if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
 			goto out;
 	}
 out:
 	ASSERT(vp->v_count > 0);
 
 	if (in_crit) {
 		nbl_end_crit(vp);
 		in_crit = 0;
 	}
 	if (error) {
 		if (open_done) {
 			(void) VOP_CLOSE(vp, filemode, 1, (offset_t)0, CRED(),
 			    NULL);
 			open_done = 0;
 			shrlock_done = 0;
 		}
 		if (shrlock_done) {
 			(void) VOP_SHRLOCK(vp, F_UNSHARE, &shr, 0, CRED(),
 			    NULL);
 			shrlock_done = 0;
 		}
 
 		/*
 		 * The following clause was added to handle a problem
 		 * with NFS consistency.  It is possible that a lookup
 		 * of the file to be opened succeeded, but the file
 		 * itself doesn't actually exist on the server.  This
 		 * is chiefly due to the DNLC containing an entry for
 		 * the file which has been removed on the server.  In
 		 * this case, we just start over.  If there was some
 		 * other cause for the ESTALE error, then the lookup
 		 * of the file will fail and the error will be returned
 		 * above instead of looping around from here.
 		 */
 		VN_RELE(vp);
 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 			goto top;
 	} else
 		*vpp = vp;
 	return (error);
 }
 
 /*
  * The following two accessor functions are for the NFSv4 server.  Since there
  * is no VOP_OPEN_UP/DOWNGRADE we need a way for the NFS server to keep the
  * vnode open counts correct when a client "upgrades" an open or does an
  * open_downgrade.  In NFS, an upgrade or downgrade can not only change the
  * open mode (add or subtract read or write), but also change the share/deny
  * modes.  However, share reservations are not integrated with OPEN, yet, so
  * we need to handle each separately.  These functions are cleaner than having
  * the NFS server manipulate the counts directly, however, nobody else should
  * use these functions.
  */
 void
 vn_open_upgrade(vnode_t *vp, int filemode)
 {
 	/* Callers are expected to hand in a regular file. */
 	ASSERT(vp->v_type == VREG);
 
 	/* Bump the open count for each access mode present in filemode. */
 	if ((filemode & FREAD) != 0) {
 		atomic_inc_32(&vp->v_rdcnt);
 	}
 	if ((filemode & FWRITE) != 0) {
 		atomic_inc_32(&vp->v_wrcnt);
 	}
 }
 
 void
 vn_open_downgrade(vnode_t *vp, int filemode)
 {
 	/* Callers are expected to hand in a regular file. */
 	ASSERT(vp->v_type == VREG);
 
 	/*
 	 * Drop the open count for each access mode present in filemode.
 	 * Each count must still be positive before it is decremented.
 	 */
 	if ((filemode & FREAD) != 0) {
 		ASSERT(vp->v_rdcnt > 0);
 		atomic_dec_32(&vp->v_rdcnt);
 	}
 	if ((filemode & FWRITE) != 0) {
 		ASSERT(vp->v_wrcnt > 0);
 		atomic_dec_32(&vp->v_wrcnt);
 	}
 }
 
 int
 vn_create(
 	char *pnamep,
 	enum uio_seg seg,
 	struct vattr *vap,
 	enum vcexcl excl,
 	int mode,
 	struct vnode **vpp,
 	enum create why,
 	int flag,
 	mode_t umask)
 {
 	/* No starting directory: vn_createat() is given a NULL startvp. */
 	struct vnode *startvp = NULL;
 
 	return (vn_createat(pnamep, seg, vap, excl, mode, vpp, why, flag,
 	    umask, startvp));
 }
 
 /*
  * Create a vnode (makenode).
  *
  * Looks up the parent directory of pnamep (relative to startvp) and then
  * asks the file system to make the object via VOP_MKDIR() when why is
  * CRMKDIR, or VOP_CREATE() otherwise.  For a non-exclusive create the
  * lookup also returns an existing target in *vpp, which is then checked
  * for read-only file systems, mandatory locks, nbmand conflicts and the
  * large-file limit before the file system is called.
  *
  *	pnamep, seg	path name and the address space it lives in
  *	vap		attributes for the new object; AT_TYPE and AT_MODE
  *			must be set in va_mask (asserted below)
  *	excl		EXCL or NONEXCL; EXCL also forces NO_FOLLOW
  *	mode		access mode passed to VOP_CREATE(); VWRITE triggers
  *			the read-only check on an existing file
  *	vpp		out: set to NULL before the lookup, then filled in
  *			by the lookup and/or the VOP
  *	why		reason for the create (CRMKDIR, CRMKNOD, ...)
  *	flag		file flags; FNOFOLLOW and FNOLINKS are stripped here
  *	umask		bits cleared from va_mode unless the parent directory
  *			has default ACLs; 0 skips the ACL query entirely
  *	startvp		starting vnode handed to lookuppnat()
  *
  * Returns 0 or an errno value.  An ESTALE failure restarts the whole
  * operation for as long as fs_need_estale_retry() allows.
  */
 int
 vn_createat(
 	char *pnamep,
 	enum uio_seg seg,
 	struct vattr *vap,
 	enum vcexcl excl,
 	int mode,
 	struct vnode **vpp,
 	enum create why,
 	int flag,
 	mode_t umask,
 	struct vnode *startvp)
 {
 	struct vnode *dvp;	/* ptr to parent dir vnode */
 	struct vnode *vp = NULL;
 	struct pathname pn;
 	int error;
 	int in_crit = 0;
 	struct vattr vattr;
 	enum symfollow follow;
 	int estale_retry = 0;
 	uint32_t auditing = AU_AUDITING();
 
 	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
 
 	/* symlink interpretation */
 	if ((flag & FNOFOLLOW) || excl == EXCL)
 		follow = NO_FOLLOW;
 	else
 		follow = FOLLOW;
 	/* These two flags are consumed here and not passed to the VOPs. */
 	flag &= ~(FNOFOLLOW|FNOLINKS);
 
 top:
 	/*
 	 * Lookup directory.
 	 * If new object is a file, call lower level to create it.
 	 * Note that it is up to the lower level to enforce exclusive
 	 * creation, if the file is already there.
 	 * This allows the lower level to do whatever
 	 * locking or protocol that is needed to prevent races.
 	 * If the new object is directory call lower level to make
 	 * the new directory, with "." and "..".
 	 */
 	if (error = pn_get(pnamep, seg, &pn))
 		return (error);
 	if (auditing)
 		audit_vncreate_start();
 	dvp = NULL;
 	*vpp = NULL;
 	/*
 	 * lookup will find the parent directory for the vnode.
 	 * When it is done the pn holds the name of the entry
 	 * in the directory.
 	 * If this is a non-exclusive create we also find the node itself.
 	 */
 	error = lookuppnat(&pn, NULL, follow, &dvp,
 	    (excl == EXCL) ? NULLVPP : vpp, startvp);
 	if (error) {
 		/* Nothing but the pathname buffer is held at this point. */
 		pn_free(&pn);
 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 			goto top;
 		if (why == CRMKDIR && error == EINVAL)
 			error = EEXIST;		/* SVID */
 		return (error);
 	}
 
 	/* The sticky bit is only kept when the caller is doing a mknod. */
 	if (why != CRMKNOD)
 		vap->va_mode &= ~VSVTX;
 
 	/*
 	 * If default ACLs are defined for the directory don't apply the
 	 * umask if umask is passed.
 	 */
 
 	if (umask) {
 
 		vsecattr_t vsec;
 
 		/* Ask only for the number of default ACL entries. */
 		vsec.vsa_aclcnt = 0;
 		vsec.vsa_aclentp = NULL;
 		vsec.vsa_dfaclcnt = 0;
 		vsec.vsa_dfaclentp = NULL;
 		vsec.vsa_mask = VSA_DFACLCNT;
 		error = VOP_GETSECATTR(dvp, &vsec, 0, CRED(), NULL);
 		/*
 		 * If error is ENOSYS then treat it as no error
 		 * Don't want to force all file systems to support
 		 * aclent_t style of ACL's.
 		 */
 		if (error == ENOSYS)
 			error = 0;
 		if (error) {
 			/* Drop the hold the lookup took on an existing target. */
 			if (*vpp != NULL)
 				VN_RELE(*vpp);
 			goto out;
 		} else {
 			/*
 			 * Apply the umask if no default ACLs.
 			 */
 			if (vsec.vsa_dfaclcnt == 0)
 				vap->va_mode &= ~umask;
 
 			/*
 			 * VOP_GETSECATTR() may have allocated memory for
 			 * ACLs we didn't request, so double-check and
 			 * free it if necessary.
 			 */
 			if (vsec.vsa_aclcnt && vsec.vsa_aclentp != NULL)
 				kmem_free((caddr_t)vsec.vsa_aclentp,
 				    vsec.vsa_aclcnt * sizeof (aclent_t));
 			if (vsec.vsa_dfaclcnt && vsec.vsa_dfaclentp != NULL)
 				kmem_free((caddr_t)vsec.vsa_dfaclentp,
 				    vsec.vsa_dfaclcnt * sizeof (aclent_t));
 		}
 	}
 
 	/*
 	 * In general we want to generate EROFS if the file system is
 	 * readonly.  However, POSIX (IEEE Std. 1003.1) section 5.3.1
 	 * documents the open system call, and it says that O_CREAT has no
 	 * effect if the file already exists.  Bug 1119649 states
 	 * that open(path, O_CREAT, ...) fails when attempting to open an
 	 * existing file on a read only file system.  Thus, the first part
 	 * of the following if statement has 3 checks:
 	 *	if the file exists &&
 	 *		it is being open with write access &&
 	 *		the file system is read only
 	 *	then generate EROFS
 	 *
 	 * The second part covers a file that does not exist yet: creating
 	 * it needs a writable parent file system.
 	 */
 	if ((*vpp != NULL && (mode & VWRITE) && ISROFILE(*vpp)) ||
 	    (*vpp == NULL && dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
 		if (*vpp)
 			VN_RELE(*vpp);
 		error = EROFS;
 	} else if (excl == NONEXCL && *vpp != NULL) {
 		vnode_t *rvp;
 
 		/*
 		 * File already exists.  If a mandatory lock has been
 		 * applied, return error.
 		 *
 		 * From here on vp carries the reference the lookup took on
 		 * the existing file; it is released at "out".
 		 */
 		vp = *vpp;
 		/* Lock lists are inspected on the real vnode when there is one. */
 		if (VOP_REALVP(vp, &rvp, NULL) != 0)
 			rvp = vp;
 		/* AT_SIZE in va_mask means the caller wants to resize the file. */
 		if ((vap->va_mask & AT_SIZE) && nbl_need_check(vp)) {
 			nbl_start_crit(vp, RW_READER);
 			in_crit = 1;
 		}
 		if (rvp->v_filocks != NULL || rvp->v_shrlocks != NULL) {
 			vattr.va_mask = AT_MODE|AT_SIZE;
 			if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) {
 				goto out;
 			}
 			if (MANDLOCK(vp, vattr.va_mode)) {
 				error = EAGAIN;
 				goto out;
 			}
 			/*
 			 * File cannot be truncated if non-blocking mandatory
 			 * locks are currently on the file.
 			 */
 			if ((vap->va_mask & AT_SIZE) && in_crit) {
 				u_offset_t offset;
 				ssize_t length;
 
 				/*
 				 * The range checked for conflicts is the span
 				 * between the current and the requested size:
 				 * it starts at the smaller of the two and is
 				 * as long as their difference.
 				 */
 				offset = vap->va_size > vattr.va_size ?
 				    vattr.va_size : vap->va_size;
 				length = vap->va_size > vattr.va_size ?
 				    vap->va_size - vattr.va_size :
 				    vattr.va_size - vap->va_size;
 				if (nbl_conflict(vp, NBL_WRITE, offset,
 				    length, 0, NULL)) {
 					error = EACCES;
 					goto out;
 				}
 			}
 		}
 
 		/*
 		 * If the file is the root of a VFS, we've crossed a
 		 * mount point and the "containing" directory that we
 		 * acquired above (dvp) is irrelevant because it's in
 		 * a different file system.  We apply VOP_CREATE to the
 		 * target itself instead of to the containing directory
 		 * and supply a null path name to indicate (conventionally)
 		 * the node itself as the "component" of interest.
 		 *
 		 * The intercession of the file system is necessary to
 		 * ensure that the appropriate permission checks are
 		 * done.
 		 */
 		if (vp->v_flag & VROOT) {
 			ASSERT(why != CRMKDIR);
 			error = VOP_CREATE(vp, "", vap, excl, mode, vpp,
 			    CRED(), flag, NULL, NULL);
 			/*
 			 * If the create succeeded, it will have created
 			 * a new reference to the vnode.  Give up the
 			 * original reference.  The assertion should not
 			 * get triggered because NBMAND locks only apply to
 			 * VREG files.  And if in_crit is non-zero for some
 			 * reason, detect that here, rather than when we
 			 * dereference a null vp.
 			 */
 			ASSERT(in_crit == 0);
 			VN_RELE(vp);
 			vp = NULL;
 			goto out;
 		}
 
 		/*
 		 * Large File API - non-large open (FOFFMAX flag not set)
 		 * of regular file fails if the file size exceeds MAXOFF32_T.
 		 */
 		if (why != CRMKDIR &&
 		    !(flag & FOFFMAX) &&
 		    (vp->v_type == VREG)) {
 			vattr.va_mask = AT_SIZE;
 			if ((error = VOP_GETATTR(vp, &vattr, 0,
 			    CRED(), NULL))) {
 				goto out;
 			}
 			if ((vattr.va_size > (u_offset_t)MAXOFF32_T)) {
 				error = EOVERFLOW;
 				goto out;
 			}
 		}
 	}
 
 	if (error == 0) {
 		/*
 		 * Call mkdir() if specified, otherwise create().
 		 */
 		int must_be_dir = pn_fixslash(&pn);	/* trailing '/'? */
 
 		if (why == CRMKDIR)
 			/*
 			 * N.B., if vn_createat() ever requests
 			 * case-insensitive behavior then it will need
 			 * to be passed to VOP_MKDIR().  VOP_CREATE()
 			 * will already get it via "flag"
 			 */
 			error = VOP_MKDIR(dvp, pn.pn_path, vap, vpp, CRED(),
 			    NULL, 0, NULL);
 		else if (!must_be_dir)
 			error = VOP_CREATE(dvp, pn.pn_path, vap,
 			    excl, mode, vpp, CRED(), flag, NULL, NULL);
 		else
 			/* A non-directory create was asked for "name/". */
 			error = ENOTDIR;
 	}
 
 out:
 
 	/*
 	 * Common exit: finish auditing, leave the nbmand critical region,
 	 * release the lookup's reference on an existing file (vp), and drop
 	 * the pathname buffer and the parent directory hold.
 	 */
 	if (auditing)
 		audit_vncreate_finish(*vpp, error);
 	if (in_crit) {
 		nbl_end_crit(vp);
 		in_crit = 0;
 	}
 	if (vp != NULL) {
 		VN_RELE(vp);
 		vp = NULL;
 	}
 	pn_free(&pn);
 	VN_RELE(dvp);
 	/*
 	 * The following clause was added to handle a problem
 	 * with NFS consistency.  It is possible that a lookup
 	 * of the file to be created succeeded, but the file
 	 * itself doesn't actually exist on the server.  This
 	 * is chiefly due to the DNLC containing an entry for
 	 * the file which has been removed on the server.  In
 	 * this case, we just start over.  If there was some
 	 * other cause for the ESTALE error, then the lookup
 	 * of the file will fail and the error will be returned
 	 * above instead of looping around from here.
 	 */
 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 		goto top;
 	return (error);
 }
 
 int
 vn_link(char *from, char *to, enum uio_seg seg)
 {
 	return (vn_linkat(NULL, from, NO_FOLLOW, NULL, to, seg));
 }
 
 int
 vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow,
     vnode_t *tstartvp, char *to, enum uio_seg seg)
 {
 	struct vnode *srcvp;		/* vnode the new name will refer to */
 	struct vnode *destdvp;		/* directory receiving the new name */
 	struct pathname topn;		/* "to" path; ends up as the new name */
 	struct vattr va;
 	dev_t src_fsid;
 	int err;
 	int retries = 0;
 	uint32_t auditing = AU_AUDITING();
 
 top:
 	srcvp = NULL;
 	destdvp = NULL;
 
 	err = pn_get(to, seg, &topn);
 	if (err != 0) {
 		return (err);
 	}
 
 	/* Resolve the existing object. */
 	if (auditing && fstartvp != NULL) {
 		audit_setfsat_path(1);
 	}
 	err = lookupnameat(from, seg, follow, NULLVPP, &srcvp, fstartvp);
 	if (err != 0) {
 		goto out;
 	}
 
 	/* Resolve the directory that will hold the new name. */
 	if (auditing && tstartvp != NULL) {
 		audit_setfsat_path(3);
 	}
 	err = lookuppnat(&topn, NULL, NO_FOLLOW, &destdvp, NULLVPP, tstartvp);
 	if (err != 0) {
 		goto out;
 	}
 
 	/*
 	 * The source and the target directory must report the same fsid,
 	 * and the target directory's vfs must not be read-only.
 	 */
 	va.va_mask = AT_FSID;
 	err = VOP_GETATTR(srcvp, &va, 0, CRED(), NULL);
 	if (err != 0) {
 		goto out;
 	}
 	src_fsid = va.va_fsid;
 
 	va.va_mask = AT_FSID;
 	err = VOP_GETATTR(destdvp, &va, 0, CRED(), NULL);
 	if (err != 0) {
 		goto out;
 	}
 	if (src_fsid != va.va_fsid) {
 		err = EXDEV;
 		goto out;
 	}
 	if (destdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
 		err = EROFS;
 		goto out;
 	}
 
 	/* All checks passed: hand the link to the file system. */
 	(void) pn_fixslash(&topn);
 	err = VOP_LINK(destdvp, srcvp, topn.pn_path, CRED(), NULL, 0);
 
 out:
 	pn_free(&topn);
 	if (srcvp != NULL) {
 		VN_RELE(srcvp);
 	}
 	if (destdvp != NULL) {
 		VN_RELE(destdvp);
 	}
 	/* Start over on ESTALE while fs_need_estale_retry() permits it. */
 	if ((err == ESTALE) && fs_need_estale_retry(retries++)) {
 		goto top;
 	}
 	return (err);
 }
 
 int
 vn_rename(char *from, char *to, enum uio_seg seg)
 {
 	return (vn_renameat(NULL, from, NULL, to, seg));
 }
 
 /*
  * Rename fname (looked up relative to fdvp) to tname (looked up relative
  * to tdvp); seg names the address space both path strings live in.
  *
  * Both parent directories are resolved, they must report the same fsid,
  * and the target directory's vfs must be writable.  nbmand conflicts on
  * the source and on an existing target are checked before VOP_RENAME()
  * is called.
  *
  * Returns 0 or an errno value.  An ESTALE failure restarts the whole
  * operation for as long as fs_need_estale_retry() allows.
  */
 int
 vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp,
 		char *tname, enum uio_seg seg)
 {
 	int error;
 	struct vattr vattr;
 	struct pathname fpn;		/* from pathname */
 	struct pathname tpn;		/* to pathname */
 	dev_t fsid;
 	int in_crit_src, in_crit_targ;
 	vnode_t *fromvp, *fvp;		/* source directory, source entry */
 	vnode_t *tovp, *targvp;		/* target directory, existing target */
 	int estale_retry = 0;
 	uint32_t auditing = AU_AUDITING();
 
 top:
 	/* Reset all per-attempt state; "top" is re-entered on ESTALE. */
 	fvp = fromvp = tovp = targvp = NULL;
 	in_crit_src = in_crit_targ = 0;
 	/*
 	 * Get to and from pathnames.
 	 */
 	if (error = pn_get(fname, seg, &fpn))
 		return (error);
 	if (error = pn_get(tname, seg, &tpn)) {
 		pn_free(&fpn);
 		return (error);
 	}
 
 	/*
 	 * First we need to resolve the correct directories
 	 * The passed in directories may only be a starting point,
 	 * but we need the real directories the file(s) live in.
 	 * For example the fname may be something like usr/lib/sparc
 	 * and we were passed in the / directory, but we need to
 	 * use the lib directory for the rename.
 	 */
 
 	if (auditing && fdvp != NULL)
 		audit_setfsat_path(1);
 	/*
 	 * Lookup to and from directories.
 	 */
 	if (error = lookuppnat(&fpn, NULL, NO_FOLLOW, &fromvp, &fvp, fdvp)) {
 		goto out;
 	}
 
 	/*
 	 * Make sure there is an entry.
 	 */
 	if (fvp == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 
 	if (auditing && tdvp != NULL)
 		audit_setfsat_path(3);
 	if (error = lookuppnat(&tpn, NULL, NO_FOLLOW, &tovp, &targvp, tdvp)) {
 		goto out;
 	}
 
 	/*
 	 * Make sure both the from vnode directory and the to directory
 	 * are in the same vfs and the to directory is writable.
 	 * We check fsid's, not vfs pointers, so loopback fs works.
 	 */
 	if (fromvp != tovp) {
 		vattr.va_mask = AT_FSID;
 		if (error = VOP_GETATTR(fromvp, &vattr, 0, CRED(), NULL))
 			goto out;
 		fsid = vattr.va_fsid;
 		vattr.va_mask = AT_FSID;
 		if (error = VOP_GETATTR(tovp, &vattr, 0, CRED(), NULL))
 			goto out;
 		if (fsid != vattr.va_fsid) {
 			error = EXDEV;
 			goto out;
 		}
 	}
 
 	if (tovp->v_vfsp->vfs_flag & VFS_RDONLY) {
 		error = EROFS;
 		goto out;
 	}
 
 	/*
 	 * An existing target that is a different vnode from the source
 	 * would be replaced by the rename, so check it for a conflict
 	 * with NBL_REMOVE.
 	 */
 	if (targvp && (fvp != targvp)) {
 		nbl_start_crit(targvp, RW_READER);
 		in_crit_targ = 1;
 		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
 			error = EACCES;
 			goto out;
 		}
 	}
 
 	/* The source is checked for a conflict with NBL_RENAME. */
 	if (nbl_need_check(fvp)) {
 		nbl_start_crit(fvp, RW_READER);
 		in_crit_src = 1;
 		if (nbl_conflict(fvp, NBL_RENAME, 0, 0, 0, NULL)) {
 			error = EACCES;
 			goto out;
 		}
 	}
 
 	/*
 	 * Do the rename.
 	 */
 	(void) pn_fixslash(&tpn);
 	error = VOP_RENAME(fromvp, fpn.pn_path, tovp, tpn.pn_path, CRED(),
 	    NULL, 0);
 
 out:
 	/*
 	 * Common exit: free both pathname buffers, leave any nbmand
 	 * critical regions that were entered, then drop every vnode hold
 	 * obtained by the two lookups.
 	 */
 	pn_free(&fpn);
 	pn_free(&tpn);
 	if (in_crit_src)
 		nbl_end_crit(fvp);
 	if (in_crit_targ)
 		nbl_end_crit(targvp);
 	if (fromvp)
 		VN_RELE(fromvp);
 	if (tovp)
 		VN_RELE(tovp);
 	if (targvp)
 		VN_RELE(targvp);
 	if (fvp)
 		VN_RELE(fvp);
 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 		goto top;
 	return (error);
 }
 
 /*
  * Remove a file or directory.
  */
 int
 vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
 {
 	return (vn_removeat(NULL, fnamep, seg, dirflag));
 }
 
 /*
  * Remove the entry named by fnamep, looked up relative to startvp.
  *
  *	startvp		starting vnode handed to lookuppnat()
  *	fnamep, seg	path name and the address space it lives in
  *	dirflag		RMDIRECTORY selects VOP_RMDIR(); anything else
  *			selects VOP_REMOVE()
  *
  * If the entry is the root of a file system marked VFS_UNLINKABLE, that
  * file system is unmounted first and the covered vnode is removed instead.
  *
  * Returns 0 or an errno value.  An ESTALE failure restarts the whole
  * operation for as long as fs_need_estale_retry() allows.
  */
 int
 vn_removeat(vnode_t *startvp, char *fnamep, enum uio_seg seg, enum rm dirflag)
 {
 	struct vnode *vp;		/* entry vnode */
 	struct vnode *dvp;		/* ptr to parent dir vnode */
 	struct vnode *coveredvp;
 	struct pathname pn;		/* name of entry */
 	enum vtype vtype;
 	int error;
 	struct vfs *vfsp;
 	struct vfs *dvfsp;	/* ptr to parent dir vfs */
 	int in_crit = 0;
 	int estale_retry = 0;
 
 top:
 	if (error = pn_get(fnamep, seg, &pn))
 		return (error);
 	dvp = vp = NULL;
 	if (error = lookuppnat(&pn, NULL, NO_FOLLOW, &dvp, &vp, startvp)) {
 		pn_free(&pn);
 		if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 			goto top;
 		return (error);
 	}
 
 	/*
 	 * Make sure there is an entry.
 	 */
 	if (vp == NULL) {
 		error = ENOENT;
 		goto out;
 	}
 
 	vfsp = vp->v_vfsp;
 	dvfsp = dvp->v_vfsp;
 
 	/*
 	 * If the named file is the root of a mounted filesystem, fail,
 	 * unless it's marked unlinkable.  In that case, unmount the
 	 * filesystem and proceed to unlink the covered vnode.  (If the
 	 * covered vnode is a directory, use rmdir instead of unlink,
 	 * to avoid file system corruption.)
 	 */
 	if (vp->v_flag & VROOT) {
 		if ((vfsp->vfs_flag & VFS_UNLINKABLE) == 0) {
 			error = EBUSY;
 			goto out;
 		}
 
 		/*
 		 * Namefs specific code starts here.
 		 */
 
 		if (dirflag == RMDIRECTORY) {
 			/*
 			 * User called rmdir(2) on a file that has
 			 * been namefs mounted on top of.  Since
 			 * namefs doesn't allow directories to
 			 * be mounted on other files we know
 			 * vp is not of type VDIR so fail the operation.
 			 */
 			error = ENOTDIR;
 			goto out;
 		}
 
 		/*
 		 * If VROOT is still set after grabbing vp->v_lock,
 		 * no one has finished nm_unmount so far and coveredvp
 		 * is valid.
 		 * If we manage to grab vn_vfswlock(coveredvp) before releasing
 		 * vp->v_lock, any race window is eliminated.
 		 */
 
 		mutex_enter(&vp->v_lock);
 		if ((vp->v_flag & VROOT) == 0) {
 			/* Someone beat us to the unmount */
 			mutex_exit(&vp->v_lock);
 			error = EBUSY;
 			goto out;
 		}
 		/* Re-read the vfs now that v_lock is held. */
 		vfsp = vp->v_vfsp;
 		coveredvp = vfsp->vfs_vnodecovered;
 		ASSERT(coveredvp);
 		/*
 		 * Note: Implementation of vn_vfswlock shows that ordering of
 		 * v_lock / vn_vfswlock is not an issue here.
 		 */
 		error = vn_vfswlock(coveredvp);
 		mutex_exit(&vp->v_lock);
 
 		if (error)
 			goto out;
 
 		/*
 		 * Swap our hold from the mounted root to the covered vnode
 		 * before unmounting.
 		 */
 		VN_HOLD(coveredvp);
 		VN_RELE(vp);
 		error = dounmount(vfsp, 0, CRED());
 
 		/*
 		 * Unmounted the namefs file system; now get
 		 * the object it was mounted over.
 		 */
 		vp = coveredvp;
 		/*
 		 * If namefs was mounted over a directory, then
 		 * we want to use rmdir() instead of unlink().
 		 */
 		if (vp->v_type == VDIR)
 			dirflag = RMDIRECTORY;
 
 		/*
 		 * The dounmount() result is only checked here, after vp has
 		 * been switched to coveredvp, so that "out" releases the
 		 * hold taken above.
 		 */
 		if (error)
 			goto out;
 	}
 
 	/*
 	 * Make sure filesystem is writeable.
 	 * We check the parent directory's vfs in case this is an lofs vnode.
 	 */
 	if (dvfsp && dvfsp->vfs_flag & VFS_RDONLY) {
 		error = EROFS;
 		goto out;
 	}
 
 	/* Remember the type now; vp may be released and cleared just below. */
 	vtype = vp->v_type;
 
 	/*
 	 * If there is the possibility of an nbmand share reservation, make
 	 * sure it's okay to remove the file.  Keep a reference to the
 	 * vnode, so that we can exit the nbl critical region after
 	 * calling VOP_REMOVE.
 	 * If there is no possibility of an nbmand share reservation,
 	 * release the vnode reference now.  Filesystems like NFS may
 	 * behave differently if there is an extra reference, so get rid of
 	 * this one.  Fortunately, we can't have nbmand mounts on NFS
 	 * filesystems.
 	 */
 	if (nbl_need_check(vp)) {
 		nbl_start_crit(vp, RW_READER);
 		in_crit = 1;
 		if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, NULL)) {
 			error = EACCES;
 			goto out;
 		}
 	} else {
 		VN_RELE(vp);
 		vp = NULL;
 	}
 
 	if (dirflag == RMDIRECTORY) {
 		/*
 		 * Caller is using rmdir(2), which can only be applied to
 		 * directories.
 		 */
 		if (vtype != VDIR) {
 			error = ENOTDIR;
 		} else {
 			vnode_t *cwd;
 			proc_t *pp = curproc;
 
 			/*
 			 * Take a hold on the process's current directory
 			 * under p_lock and pass it to VOP_RMDIR().
 			 */
 			mutex_enter(&pp->p_lock);
 			cwd = PTOU(pp)->u_cdir;
 			VN_HOLD(cwd);
 			mutex_exit(&pp->p_lock);
 			error = VOP_RMDIR(dvp, pn.pn_path, cwd, CRED(),
 			    NULL, 0);
 			VN_RELE(cwd);
 		}
 	} else {
 		/*
 		 * Unlink(2) can be applied to anything.
 		 */
 		error = VOP_REMOVE(dvp, pn.pn_path, CRED(), NULL, 0);
 	}
 
 out:
 	/*
 	 * Common exit: free the pathname buffer, leave the nbmand critical
 	 * region if it was entered, and drop whatever vnode holds remain.
 	 */
 	pn_free(&pn);
 	if (in_crit) {
 		nbl_end_crit(vp);
 		in_crit = 0;
 	}
 	if (vp != NULL)
 		VN_RELE(vp);
 	if (dvp != NULL)
 		VN_RELE(dvp);
 	if ((error == ESTALE) && fs_need_estale_retry(estale_retry++))
 		goto top;
 	return (error);
 }
 
 /*
  * Utility function to compare equality of vnodes.
  * Compare the underlying real vnodes, if there are underlying vnodes.
  * This is a more thorough comparison than the VN_CMP() macro provides.
  */
 int
 vn_compare(vnode_t *vp1, vnode_t *vp2)
 {
 	vnode_t *under;
 
 	/*
 	 * Replace each non-NULL argument with its underlying real vnode
 	 * whenever VOP_REALVP() succeeds for it.
 	 */
 	if (vp1 != NULL) {
 		if (VOP_REALVP(vp1, &under, NULL) == 0) {
 			vp1 = under;
 		}
 	}
 	if (vp2 != NULL) {
 		if (VOP_REALVP(vp2, &under, NULL) == 0) {
 			vp2 = under;
 		}
 	}
 
 	/* Compare whatever vnodes we ended up with. */
 	return (VN_CMP(vp1, vp2));
 }
 
 /*
  * The number of locks to hash into.  This value must be a power
  * of 2 minus 1 and should probably also be prime.
  */
 /* Used both as the hash mask and as the highest valid bucket index. */
 #define	NUM_BUCKETS	1023
 
 /*
  * One hash bucket: a mutex protecting a singly linked list of lock
  * entries.  The pad member rounds the structure up to 64 bytes.
  */
 struct  vn_vfslocks_bucket {
 	kmutex_t vb_lock;
 	vn_vfslocks_entry_t *vb_list;
 	char pad[64 - sizeof (kmutex_t) - sizeof (void *)];
 };
 
 /*
  * Total number of buckets will be NUM_BUCKETS + 1 .
  */
 
 /* Align the table to 64 bytes, the same size each bucket is padded to. */
 #pragma	align	64(vn_vfslocks_buckets)
 static	struct vn_vfslocks_bucket	vn_vfslocks_buckets[NUM_BUCKETS + 1];
 
 /* Number of low-order pointer bits discarded before masking. */
 #define	VN_VFSLOCKS_SHIFT	9
 
 /*
  * Map a vfs/vnode pointer to a bucket index in [0, NUM_BUCKETS]: shift
  * the address right by VN_VFSLOCKS_SHIFT and mask with NUM_BUCKETS.
  */
 #define	VN_VFSLOCKS_HASH(vfsvpptr)	\
 	((((intptr_t)(vfsvpptr)) >> VN_VFSLOCKS_SHIFT) & NUM_BUCKETS)
 
 /*
  * vn_vfslocks_getlock() uses an HASH scheme to generate
  * rwstlock using vfs/vnode pointer passed to it.
  *
  * vn_vfslocks_rele() releases a reference in the
  * HASH table which allows the entry allocated by
  * vn_vfslocks_getlock() to be freed at a later
  * stage when the refcount drops to zero.
  */
 
 vn_vfslocks_entry_t *
 vn_vfslocks_getlock(void *vfsvpptr)
 {
 	struct vn_vfslocks_bucket *bucket;
 	vn_vfslocks_entry_t *cur;
 	vn_vfslocks_entry_t *fresh;
 
 	ASSERT(vfsvpptr != NULL);
 	bucket = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vfsvpptr)];
 
 	/*
 	 * First pass: if an entry for this pointer is already hashed,
 	 * take another reference on it and hand it back.
 	 */
 	mutex_enter(&bucket->vb_lock);
 	cur = bucket->vb_list;
 	while (cur != NULL && cur->ve_vpvfs != vfsvpptr)
 		cur = cur->ve_next;
 	if (cur != NULL) {
 		cur->ve_refcnt++;
 		mutex_exit(&bucket->vb_lock);
 		return (cur);
 	}
 	mutex_exit(&bucket->vb_lock);
 
 	/* Build a new entry while the bucket lock is dropped. */
 	fresh = kmem_alloc(sizeof (*fresh), KM_SLEEP);
 	rwst_init(&fresh->ve_lock, NULL, RW_DEFAULT, NULL);
 	fresh->ve_vpvfs = (char *)vfsvpptr;
 	fresh->ve_refcnt = 1;
 
 	/*
 	 * Second pass: the lock was released above, so an entry for the
 	 * same pointer may have been inserted in the meantime.  If so,
 	 * use it and throw away the one we just built.
 	 */
 	mutex_enter(&bucket->vb_lock);
 	cur = bucket->vb_list;
 	while (cur != NULL && cur->ve_vpvfs != vfsvpptr)
 		cur = cur->ve_next;
 	if (cur != NULL) {
 		cur->ve_refcnt++;
 		mutex_exit(&bucket->vb_lock);
 
 		rwst_destroy(&fresh->ve_lock);
 		kmem_free(fresh, sizeof (*fresh));
 		return (cur);
 	}
 
 	/* Still absent: push the new entry on the front of the list. */
 	fresh->ve_next = bucket->vb_list;
 	bucket->vb_list = fresh;
 	mutex_exit(&bucket->vb_lock);
 	return (fresh);
 }
 
 void
 vn_vfslocks_rele(vn_vfslocks_entry_t *vepent)
 {
 	struct vn_vfslocks_bucket *bucket;
 	vn_vfslocks_entry_t *cur;
 	vn_vfslocks_entry_t *prev = NULL;	/* node preceding cur, if any */
 
 	ASSERT(vepent != NULL);
 	ASSERT(vepent->ve_vpvfs != NULL);
 
 	bucket = &vn_vfslocks_buckets[VN_VFSLOCKS_HASH(vepent->ve_vpvfs)];
 
 	mutex_enter(&bucket->vb_lock);
 	vepent->ve_refcnt--;
 
 	if ((int32_t)vepent->ve_refcnt < 0)
 		cmn_err(CE_PANIC, "vn_vfslocks_rele: refcount negative");
 
 	/* Other references remain: nothing more to do. */
 	if (vepent->ve_refcnt != 0) {
 		mutex_exit(&bucket->vb_lock);
 		return;
 	}
 
 	/*
 	 * Last reference dropped.  Find the entry hashed for this pointer,
 	 * unlink it from the bucket list and free it.
 	 */
 	for (cur = bucket->vb_list; cur != NULL;
 	    prev = cur, cur = cur->ve_next) {
 		if (cur->ve_vpvfs != vepent->ve_vpvfs)
 			continue;
 
 		if (prev == NULL)
 			bucket->vb_list = cur->ve_next;
 		else
 			prev->ve_next = cur->ve_next;
 		mutex_exit(&bucket->vb_lock);
 		rwst_destroy(&cur->ve_lock);
 		kmem_free(cur, sizeof (*cur));
 		return;
 	}
 
 	cmn_err(CE_PANIC, "vn_vfslocks_rele: vp/vfs not found");
 	mutex_exit(&bucket->vb_lock);
 }
 
 /*
  * vn_vfswlock_wait is used to implement a lock which is logically a writers
  * lock protecting the v_vfsmountedhere field.
  * vn_vfswlock_wait has been modified to be similar to vn_vfswlock,
  * except that it blocks to acquire the lock VVFSLOCK.
  *
  * traverse() and routines re-implementing part of traverse (e.g. autofs)
  * need to hold this lock. mount(), vn_rename(), vn_remove() and so on
  * need the non-blocking version of the writers lock i.e. vn_vfswlock
  */
 int
 vn_vfswlock_wait(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 	int rv;
 
 	ASSERT(vp != NULL);
 
 	entry = vn_vfslocks_getlock(vp);
 	rv = rwst_enter_sig(&entry->ve_lock, RW_WRITER);
 
 	/*
 	 * On EINTR, give back the reference taken by
 	 * vn_vfslocks_getlock(); otherwise the reference is kept.
 	 */
 	if (rv == EINTR) {
 		vn_vfslocks_rele(entry);
 	}
 	return (rv);
 }
 
 int
 vn_vfsrlock_wait(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 	int rv;
 
 	ASSERT(vp != NULL);
 
 	entry = vn_vfslocks_getlock(vp);
 	rv = rwst_enter_sig(&entry->ve_lock, RW_READER);
 
 	/*
 	 * On EINTR, give back the reference taken by
 	 * vn_vfslocks_getlock(); otherwise the reference is kept.
 	 */
 	if (rv == EINTR) {
 		vn_vfslocks_rele(entry);
 	}
 	return (rv);
 }
 
 
 /*
  * vn_vfswlock is used to implement a lock which is logically a writers lock
  * protecting the v_vfsmountedhere field.
  */
 int
 vn_vfswlock(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 
 	/*
 	 * A NULL vp means somebody is trying to lock the covered vnode
 	 * of / (vfs_vnodecovered is NULL for /), which only happens when
 	 * unmounting /.  That operation will fail anyway, so report EBUSY
 	 * here instead of in VFS_UNMOUNT.
 	 */
 	if (vp == NULL) {
 		return (EBUSY);
 	}
 
 	entry = vn_vfslocks_getlock(vp);
 
 	/* Non-blocking attempt: on failure drop our reference again. */
 	if (!rwst_tryenter(&entry->ve_lock, RW_WRITER)) {
 		vn_vfslocks_rele(entry);
 		return (EBUSY);
 	}
 	return (0);
 }
 
 int
 vn_vfsrlock(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 
 	/*
 	 * A NULL vp means somebody is trying to lock the covered vnode
 	 * of / (vfs_vnodecovered is NULL for /), which only happens when
 	 * unmounting /.  That operation will fail anyway, so report EBUSY
 	 * here instead of in VFS_UNMOUNT.
 	 */
 	if (vp == NULL) {
 		return (EBUSY);
 	}
 
 	entry = vn_vfslocks_getlock(vp);
 
 	/* Non-blocking attempt: on failure drop our reference again. */
 	if (!rwst_tryenter(&entry->ve_lock, RW_READER)) {
 		vn_vfslocks_rele(entry);
 		return (EBUSY);
 	}
 	return (0);
 }
 
 void
 vn_vfsunlock(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 
 	/*
 	 * Two references have to be dropped on the way out:
 	 *   - the one taken by the vn_vfslocks_getlock() call just below,
 	 *     which is only used to find the entry again, and
 	 *   - the one left in place by the locking routine
 	 *     (vn_vfsrlock/vn_vfswlock and friends) that acquired the lock.
 	 */
 	entry = vn_vfslocks_getlock(vp);
 	vn_vfslocks_rele(entry);
 
 	/* Release the lock itself, then the locker's reference. */
 	rwst_exit(&entry->ve_lock);
 	vn_vfslocks_rele(entry);
 }
 
 int
 vn_vfswlock_held(vnode_t *vp)
 {
 	vn_vfslocks_entry_t *entry;
 	int is_held;
 
 	ASSERT(vp != NULL);
 
 	/*
 	 * Look the entry up (taking a temporary reference), query the
 	 * writer state, and drop the temporary reference again.
 	 */
 	entry = vn_vfslocks_getlock(vp);
 	is_held = rwst_lock_held(&entry->ve_lock, RW_WRITER);
 	vn_vfslocks_rele(entry);
 
 	return (is_held);
 }
 
 
 /*
  * Allocate a vnodeops_t for the named file system and fill it in from
  * the operation template via fs_build_vector().
  *
  * On success returns 0 and *actual points at the new vector, which the
  * caller eventually frees with vn_freevnodeops().  On failure returns
  * the error from fs_build_vector(), frees the vector, and sets *actual
  * to NULL so the caller is never left holding a pointer to freed memory.
  */
 int
 vn_make_ops(
 	const char *name,			/* Name of file system */
 	const fs_operation_def_t *templ,	/* Operation specification */
 	vnodeops_t **actual)			/* Return the vnodeops */
 {
 	/*
 	 * Start at zero so that the DEBUG check below never reads an
 	 * indeterminate value should fs_build_vector() fail before
 	 * storing a count.
 	 */
 	int unused_ops = 0;
 	int error;
 
 	*actual = (vnodeops_t *)kmem_alloc(sizeof (vnodeops_t), KM_SLEEP);
 
 	(*actual)->vnop_name = name;
 
 	error = fs_build_vector(*actual, &unused_ops, vn_ops_table, templ);
 	if (error) {
 		kmem_free(*actual, sizeof (vnodeops_t));
 		*actual = NULL;
 	}
 
 #if DEBUG
 	if (unused_ops != 0)
 		cmn_err(CE_WARN, "vn_make_ops: %s: %d operations supplied "
 		    "but not used", name, unused_ops);
 #endif
 
 	return (error);
 }
 
 /*
  * Free the vnodeops created as a result of vn_make_ops()
  */
 void
 vn_freevnodeops(vnodeops_t *vnops)
 {
 	/* Counterpart of the kmem_alloc() done in vn_make_ops(). */
 	kmem_free(vnops, sizeof (*vnops));
 }
 
 /*
  * Vnode cache.
  */
 
 /* ARGSUSED */
 static int
 vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
 {
 	struct vnode *vp = buf;
 
 	/* Set up the synchronization objects embedded in the vnode. */
 	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vp->v_vsd_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
 
 	/* v_femhead must be cleared before vn_reinit() is ever called. */
 	vp->v_femhead = NULL;
 
 	/* A freshly constructed vnode has the shared empty path. */
 	vp->v_path = vn_vpath_empty;
 	vp->v_path_stamp = 0;
 
 	/* No auxiliary data is attached yet. */
 	vp->v_mpssdata = NULL;
 	vp->v_vsd = NULL;
 	vp->v_fopdata = NULL;
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 vn_cache_destructor(void *buf, void *cdrarg)
 {
 	struct vnode *vp = buf;
 
 	/* Tear down the objects set up by vn_cache_constructor(). */
 	rw_destroy(&vp->v_nbllock);
 	cv_destroy(&vp->v_cv);
 	mutex_destroy(&vp->v_vsd_lock);
 	mutex_destroy(&vp->v_lock);
 }
 
 void
 vn_create_cache(void)
 {
 	/*
 	 * Sanity check: VNODE_ALIGN_LOG2 must describe the size of
 	 * struct vnode once rounded up to VNODE_ALIGN.
 	 */
 	/* LINTED */
 	ASSERT((1 << VNODE_ALIGN_LOG2) ==
 	    P2ROUNDUP(sizeof (struct vnode), VNODE_ALIGN));
 
 	/* Create the kmem cache that vn_alloc() draws vnodes from. */
 	vn_cache = kmem_cache_create("vn_cache",
 	    sizeof (struct vnode), VNODE_ALIGN,
 	    vn_cache_constructor, vn_cache_destructor,
 	    NULL, NULL, NULL, 0);
 }
 
 void
 vn_destroy_cache(void)
 {
 	/* Undo vn_create_cache(). */
 	kmem_cache_destroy(vn_cache);
 }
 
 /*
  * Used by file systems when fs-specific nodes (e.g., ufs inodes) are
  * cached by the file system and vnodes remain associated.
  */
 void
 vn_recycle(vnode_t *vp)
 {
 	ASSERT(vp->v_pages == NULL);
 	VERIFY(vp->v_path != NULL);
 
 	/*
 	 * XXX - Resetting the counts really belongs in vn_reinit(), but
 	 * there are some issues with the counts.  Doing it here gives a
 	 * clean initialization.
 	 */
 	vp->v_rdcnt = 0;
 	vp->v_wrcnt = 0;
 	vp->v_mmap_read = 0;
 	vp->v_mmap_write = 0;
 
 	/*
 	 * Clean up after FEM if it was in use.  NOTE: the vnode
 	 * constructor initializes vp->v_femhead to NULL.
 	 */
 	if (vp->v_femhead != NULL) {
 		/* XXX - There should be a free_femhead() that does all this */
 		ASSERT(vp->v_femhead->femh_list == NULL);
 		mutex_destroy(&vp->v_femhead->femh_lock);
 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
 		vp->v_femhead = NULL;
 	}
 
 	/* Free a private path string and fall back to the shared empty one. */
 	if (vp->v_path != vn_vpath_empty) {
 		size_t pathsz = strlen(vp->v_path) + 1;
 
 		kmem_free(vp->v_path, pathsz);
 		vp->v_path = vn_vpath_empty;
 	}
 	vp->v_path_stamp = 0;
 
 	/* Release remaining per-vnode auxiliary data. */
 	if (vp->v_fopdata != NULL) {
 		free_fopdata(vp);
 	}
 	vp->v_mpssdata = NULL;
 	vsd_free(vp);
 }
 
 /*
  * Reset a vnode's fields, both those that are directly accessible and
  * those which require an accessor function.
  *
  * Left untouched:
  *	synchronization objects: v_lock, v_vsd_lock, v_nbllock, v_cv
  *	v_data (since FS-nodes and vnodes point to each other and should
  *		be updated simultaneously)
  *	v_op (in case someone needs to make a VOP call on this object)
  */
 void
 vn_reinit(vnode_t *vp)
 {
 	/* Reference counts: one hold for the caller, none from the dnlc. */
 	vp->v_count = 1;
 	vp->v_count_dnlc = 0;
 
 	/* Identity and type. */
 	vp->v_type = VNON;
 	vp->v_rdev = NODEV;
 	vp->v_flag = 0;
 
 	/* Associations with other objects. */
 	vp->v_vfsp = NULL;
 	vp->v_vfsmountedhere = NULL;
 	vp->v_stream = NULL;
 	vp->v_filocks = NULL;
 	vp->v_shrlocks = NULL;
 	vp->v_pages = NULL;
 	vp->v_locality = NULL;
 	vp->v_xattrdir = NULL;
 
 	/*
 	 * vn_reinit() is occasionally used on locally defined vnode_t
 	 * instances that never went through vn_alloc() and its constructor;
 	 * give those a valid (empty) v_path before vn_recycle() verifies it.
 	 */
 	if (vp->v_path == NULL)
 		vp->v_path = vn_vpath_empty;
 
 	/* Handles v_femhead, v_path, and the r/w/map counts */
 	vn_recycle(vp);
 }
 
 /*
  * Allocate a vnode from vn_cache using the given kmem flag and reinitialize
  * it.  Returns NULL if the cache allocation fails.
  */
 vnode_t *
 vn_alloc(int kmflag)
 {
 	vnode_t *vp = kmem_cache_alloc(vn_cache, kmflag);
 
 	if (vp == NULL)
 		return (NULL);
 
 	/* Both must be cleared before vn_reinit() calls vn_recycle() */
 	vp->v_femhead = NULL;
 	vp->v_fopdata = NULL;
 	vn_reinit(vp);
 
 	return (vp);
 }
 
 /*
  * Release everything hanging off a vnode (cached path, FEM head, fop data,
  * vsd) and return the vnode to vn_cache.
  */
 void
 vn_free(vnode_t *vp)
 {
 	ASSERT(vp->v_shrlocks == NULL);
 	ASSERT(vp->v_filocks == NULL);
 
 	/*
 	 * Depending on the file system, vn_free() is entered with a v_count
 	 * of either zero or one.  No other value is ever expected.
 	 */
 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
 	ASSERT(vp->v_count_dnlc == 0);
 	VERIFY(vp->v_path != NULL);
 
 	/* Free an allocated path; the vn_vpath_empty placeholder is kept. */
 	if (vp->v_path != vn_vpath_empty) {
 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
 		vp->v_path = vn_vpath_empty;
 	}
 
 	/* If FEM was in use, make sure everything gets cleaned up */
 	if (vp->v_femhead != NULL) {
 		/* XXX - There should be a free_femhead() that does all this */
 		ASSERT(vp->v_femhead->femh_list == NULL);
 		mutex_destroy(&vp->v_femhead->femh_lock);
 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
 		vp->v_femhead = NULL;
 	}
 
 	if (vp->v_fopdata != NULL)
 		free_fopdata(vp);
 
 	vp->v_mpssdata = NULL;
 	vsd_free(vp);
 	kmem_cache_free(vn_cache, vp);
 }
 
 /*
  * vnode status changes, should define better states than 1, 0.
  *
  * vn_reclaim() reports the VNTRANS_RECLAIMED transition through
  * VFS_VNSTATE().  Nothing is reported when the vnode has no vfs, or when
  * that vfs lacks implementation data or a FEM head.
  */
 void
 vn_reclaim(vnode_t *vp)
 {
 	vfs_t   *vfsp = vp->v_vfsp;
 
 	if (vfsp != NULL &&
 	    vfsp->vfs_implp != NULL && vfsp->vfs_femhead != NULL) {
 		(void) VFS_VNSTATE(vfsp, vp, VNTRANS_RECLAIMED);
 	}
 }
 
 /*
  * Report the VNTRANS_IDLED transition through VFS_VNSTATE().  Nothing is
  * reported when the vnode has no vfs, or when that vfs lacks implementation
  * data or a FEM head.
  */
 void
 vn_idle(vnode_t *vp)
 {
 	vfs_t   *vfsp = vp->v_vfsp;
 
 	if (vfsp != NULL &&
 	    vfsp->vfs_implp != NULL && vfsp->vfs_femhead != NULL) {
 		(void) VFS_VNSTATE(vfsp, vp, VNTRANS_IDLED);
 	}
 }
 /*
  * Report the VNTRANS_EXISTS transition through VFS_VNSTATE().  Nothing is
  * reported when the vnode has no vfs, or when that vfs lacks implementation
  * data or a FEM head.
  */
 void
 vn_exists(vnode_t *vp)
 {
 	vfs_t   *vfsp = vp->v_vfsp;
 
 	if (vfsp != NULL &&
 	    vfsp->vfs_implp != NULL && vfsp->vfs_femhead != NULL) {
 		(void) VFS_VNSTATE(vfsp, vp, VNTRANS_EXISTS);
 	}
 }
 
 /*
  * Report the VNTRANS_DESTROYED transition through VFS_VNSTATE().  Nothing
  * is reported when the vnode has no vfs, or when that vfs lacks
  * implementation data or a FEM head.
  */
 void
 vn_invalid(vnode_t *vp)
 {
 	vfs_t   *vfsp = vp->v_vfsp;
 
 	if (vfsp != NULL &&
 	    vfsp->vfs_implp != NULL && vfsp->vfs_femhead != NULL) {
 		(void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
 	}
 }
 
 /* Vnode event notification */
 
 /*
  * Issue a VE_SUPPORT vnode event on vp and return the result of
  * VOP_VNEVENT().  Returns EINVAL when vp is NULL.
  */
 int
 vnevent_support(vnode_t *vp, caller_context_t *ct)
 {
 	return ((vp == NULL) ? EINVAL :
 	    VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
 }
 
 /*
  * Deliver a VE_RENAME_SRC event (with dvp and name) on vp.  No-op when vp
  * is NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_RENAME_DEST event (with dvp and name) on vp.  No-op when vp
  * is NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
     caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_RENAME_DEST_DIR event on vp.  No-op when vp is NULL or has
  * no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
 	}
 }
 
 /*
  * Deliver a VE_REMOVE event (with dvp and name) on vp.  No-op when vp is
  * NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_RMDIR event (with dvp and name) on vp.  No-op when vp is
  * NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_PRE_RENAME_SRC event (with dvp and name) on vp.  No-op when
  * vp is NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_pre_rename_src(vnode_t *vp, vnode_t *dvp, char *name,
     caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_PRE_RENAME_SRC, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_PRE_RENAME_DEST event (with dvp and name) on vp.  No-op when
  * vp is NULL or has no FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_pre_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
     caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST, dvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_PRE_RENAME_DEST_DIR event (with nvp and name) on vp.  No-op
  * when vp is NULL or has no FEM head; the VOP_VNEVENT() result is
  * discarded.
  */
 void
 vnevent_pre_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
     caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_PRE_RENAME_DEST_DIR, nvp, name, ct);
 	}
 }
 
 /*
  * Deliver a VE_CREATE event on vp.  No-op when vp is NULL or has no FEM
  * head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_create(vnode_t *vp, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_CREATE, NULL, NULL, ct);
 	}
 }
 
 /*
  * Deliver a VE_LINK event on vp.  No-op when vp is NULL or has no FEM
  * head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_link(vnode_t *vp, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_LINK, NULL, NULL, ct);
 	}
 }
 
 /*
  * Deliver a VE_MOUNTEDOVER event on vp.  No-op when vp is NULL or has no
  * FEM head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
 	}
 }
 
 /*
  * Deliver a VE_TRUNCATE event on vp.  No-op when vp is NULL or has no FEM
  * head; the VOP_VNEVENT() result is discarded.
  */
 void
 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
 {
 	if (vp != NULL && vp->v_femhead != NULL) {
 		(void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
 	}
 }
 
 /*
  * Vnode accessors.
  */
 
 /*
  * Return nonzero if the vfs containing vp has VFS_RDONLY set, zero if not.
  * The nonzero value is the raw masked flag, not necessarily 1.  vp->v_vfsp
  * is dereferenced without a NULL check.
  */
 int
 vn_is_readonly(vnode_t *vp)
 {
 	return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
 }
 
 /*
  * Return 1 if the vnode has a non-NULL v_filocks, 0 otherwise.
  */
 int
 vn_has_flocks(vnode_t *vp)
 {
 	return ((vp->v_filocks == NULL) ? 0 : 1);
 }
 
 /*
  * Return 1 if the vnode has a non-NULL v_filocks and MANDLOCK() holds for
  * the given mode, 0 otherwise.
  */
 int
 vn_has_mandatory_locks(vnode_t *vp, int mode)
 {
 	if (vp->v_filocks == NULL)
 		return (0);
 
 	return ((MANDLOCK(vp, mode)) ? 1 : 0);
 }
 
 /*
  * Return 1 if the vnode has a non-NULL v_pages, 0 otherwise.
  */
 int
 vn_has_cached_data(vnode_t *vp)
 {
 	return ((vp->v_pages == NULL) ? 0 : 1);
 }
 
 /*
  * Return 0 if the vnode in question shouldn't be permitted into a zone via
  * zone_enter(2); return 1 otherwise.
  */
 int
 vn_can_change_zones(vnode_t *vp)
 {
 	struct vfssw *vswp;
 	vnode_t *rvp;
 	int allow;
 
 	if (nfs_global_client_only != 0)
 		return (1);
 
 	/*
 	 * Always judge by the underlying vnode when there is one; fall back
 	 * to vp itself if VOP_REALVP() fails.
 	 */
 	if (VOP_REALVP(vp, &rvp, NULL) != 0)
 		rvp = vp;
 
 	/*
 	 * Some pseudo filesystems (including doorfs) don't actually register
 	 * their vfsops_t, so the lookup below may return NULL; such vnodes
 	 * are happily allowed to switch zones.
 	 */
 	vswp = vfs_getvfsswbyvfsops(vfs_getops(rvp->v_vfsp));
 	if (vswp == NULL)
 		return (1);
 
 	allow = ((vswp->vsw_flag & VSW_NOTZONESAFE) != 0) ? 0 : 1;
 	vfs_unrefvfssw(vswp);
 
 	return (allow);
 }
 
 /*
  * Return nonzero (1) if a vfs is mounted on the vnode (v_vfsmountedhere is
  * set), zero if not.
  */
 int
 vn_ismntpt(vnode_t *vp)
 {
 	return ((vp->v_vfsmountedhere == NULL) ? 0 : 1);
 }
 
 /*
  * Retrieve the vfs (if any) mounted on this vnode.  Returns the vnode's
  * v_vfsmountedhere pointer, which is NULL when nothing is mounted there.
  */
 vfs_t *
 vn_mountedvfs(vnode_t *vp)
 {
 	return (vp->v_vfsmountedhere);
 }
 
 /*
  * Return nonzero (1) if the vnode is referenced by the dnlc, i.e. its
  * v_count_dnlc is greater than zero; return zero if not.
  */
 int
 vn_in_dnlc(vnode_t *vp)
 {
 	return ((vp->v_count_dnlc > 0) ? 1 : 0);
 }
 
 /*
  * vn_has_other_opens() checks whether a particular file is opened by more than
  * just the caller and whether the open is for read and/or write.
  * This routine is for calling after the caller has already called VOP_OPEN()
  * and the caller wishes to know if they are the only one with it open for
  * the mode(s) specified.
  *
  * Returns V_TRUE when the relevant count(s) exceed one, V_FALSE otherwise.
  *
  * Vnode counts are only kept on regular files (v_type=VREG).
  */
 int
 vn_has_other_opens(
 	vnode_t *vp,
 	v_mode_t mode)
 {
 	int others = V_FALSE;
 
 	ASSERT(vp != NULL);
 
 	switch (mode) {
 	case V_READ:
 		if (vp->v_rdcnt > 1)
 			others = V_TRUE;
 		break;
 	case V_WRITE:
 		if (vp->v_wrcnt > 1)
 			others = V_TRUE;
 		break;
 	case V_RDORWR:
 		if ((vp->v_rdcnt > 1) || (vp->v_wrcnt > 1))
 			others = V_TRUE;
 		break;
 	case V_RDANDWR:
 		if ((vp->v_rdcnt > 1) && (vp->v_wrcnt > 1))
 			others = V_TRUE;
 		break;
 	}
 
 	return (others);
 }
 
 /*
  * vn_is_opened() checks whether a particular file is opened and
  * whether the open is for read and/or write.
  *
  * Returns V_TRUE when the relevant count(s) are nonzero, V_FALSE otherwise.
  *
  * Vnode counts are only kept on regular files (v_type=VREG).
  */
 int
 vn_is_opened(
 	vnode_t *vp,
 	v_mode_t mode)
 {
 	int opened = V_FALSE;
 
 	ASSERT(vp != NULL);
 
 	switch (mode) {
 	case V_READ:
 		if (vp->v_rdcnt != 0)
 			opened = V_TRUE;
 		break;
 	case V_WRITE:
 		if (vp->v_wrcnt != 0)
 			opened = V_TRUE;
 		break;
 	case V_RDORWR:
 		if (vp->v_rdcnt != 0 || vp->v_wrcnt != 0)
 			opened = V_TRUE;
 		break;
 	case V_RDANDWR:
 		if (vp->v_rdcnt != 0 && vp->v_wrcnt != 0)
 			opened = V_TRUE;
 		break;
 	}
 
 	return (opened);
 }
 
 /*
  * vn_is_mapped() checks whether a particular file is mapped and whether
  * the file is mapped read and/or write.
  *
  * Returns V_TRUE when the relevant mmap count(s) are nonzero, V_FALSE
  * otherwise.
  */
 int
 vn_is_mapped(
 	vnode_t *vp,
 	v_mode_t mode)
 {
 	int mapped = V_FALSE;
 
 	ASSERT(vp != NULL);
 
 #if !defined(_LP64)
 	/*
 	 * On 32 bit architectures a 64 bit value takes two fetches, and the
 	 * field may (potentially) change between the first and the second.
 	 * Adding zero through atomic_add_64_nv() forces an atomic read.
 	 */
 	switch (mode) {
 	case V_READ:
 		if (atomic_add_64_nv((&(vp->v_mmap_read)), 0) != 0)
 			mapped = V_TRUE;
 		break;
 	case V_WRITE:
 		if (atomic_add_64_nv((&(vp->v_mmap_write)), 0) != 0)
 			mapped = V_TRUE;
 		break;
 	case V_RDORWR:
 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0) != 0) ||
 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0) != 0))
 			mapped = V_TRUE;
 		break;
 	case V_RDANDWR:
 		if ((atomic_add_64_nv((&(vp->v_mmap_read)), 0) != 0) &&
 		    (atomic_add_64_nv((&(vp->v_mmap_write)), 0) != 0))
 			mapped = V_TRUE;
 		break;
 	}
 #else
 	switch (mode) {
 	case V_READ:
 		if (vp->v_mmap_read != 0)
 			mapped = V_TRUE;
 		break;
 	case V_WRITE:
 		if (vp->v_mmap_write != 0)
 			mapped = V_TRUE;
 		break;
 	case V_RDORWR:
 		if (vp->v_mmap_read != 0 || vp->v_mmap_write != 0)
 			mapped = V_TRUE;
 		break;
 	case V_RDANDWR:
 		if (vp->v_mmap_read != 0 && vp->v_mmap_write != 0)
 			mapped = V_TRUE;
 		break;
 	}
 #endif
 
 	return (mapped);
 }
 
 /*
  * Set the operations vector for a vnode.
  *
  * FEM ensures that the v_femhead pointer is filled in before the
  * v_op pointer is changed.  This means that if the v_femhead pointer
  * is NULL, and the v_op field hasn't changed since before we checked
  * the v_femhead pointer, then our update is ok - we are not racing with
  * FEM.
  *
  * Both vp and vnodeops must be non-NULL (asserted).
  */
 void
 vn_setops(vnode_t *vp, vnodeops_t *vnodeops)
 {
 	vnodeops_t	*op;
 
 	ASSERT(vp != NULL);
 	ASSERT(vnodeops != NULL);
 
 	/* Snapshot v_op first; the barrier orders it before the femhead read */
 	op = vp->v_op;
 	membar_consumer();
 	/*
 	 * If vp->v_femhead == NULL, then we'll call atomic_cas_ptr() to do
 	 * the compare-and-swap on vp->v_op.  If either fails, then FEM is
 	 * in effect on the vnode and we need to have FEM deal with it.
 	 */
 	if (vp->v_femhead != NULL || atomic_cas_ptr(&vp->v_op, op, vnodeops) !=
 	    op) {
 		fem_setvnops(vp, vnodeops);
 	}
 }
 
 /*
  * Retrieve the operations vector for a vnode.
  * As with vn_setops() (above), make sure we aren't racing with FEM.
  * FEM sets the v_op to a special, internal, vnodeops that wouldn't
  * make sense to the callers of this routine.
  *
  * Returns the snapshot of v_op when no FEM head is installed and v_op has
  * not changed since the snapshot; otherwise returns what fem_getvnops()
  * reports.
  */
 vnodeops_t *
 vn_getops(vnode_t *vp)
 {
 	vnodeops_t	*op;
 
 	ASSERT(vp != NULL);
 
 	/* Snapshot v_op first; the barrier orders it before the femhead read */
 	op = vp->v_op;
 	membar_consumer();
 	if (vp->v_femhead == NULL && op == vp->v_op) {
 		return (op);
 	} else {
 		return (fem_getvnops(vp));
 	}
 }
 
 /*
  * Returns non-zero (1) if the vnodeops matches that of the vnode, as
  * reported by vn_getops().
  * Returns zero (0) if not.
  */
 int
 vn_matchops(vnode_t *vp, vnodeops_t *vnodeops)
 {
 	vnodeops_t *current = vn_getops(vp);
 
 	return ((current == vnodeops) ? 1 : 0);
 }
 
 /*
  * Returns non-zero (1) if the specified operation matches the
  * corresponding operation for the vnode.
  * Returns zero (0) if not, or if vopname is not found in vn_ops_table.
  */
 
 #define	MATCHNAME(n1, n2) (((n1)[0] == (n2)[0]) && (strcmp((n1), (n2)) == 0))
 
 int
 vn_matchopval(vnode_t *vp, char *vopname, fs_generic_func_p funcp)
 {
 	const fs_operation_trans_def_t *otdp;
 	vnodeops_t	*vop = vn_getops(vp);
 
 	ASSERT(vopname != NULL);
 
 	for (otdp = vn_ops_table; otdp->name != NULL; otdp++) {
 		fs_generic_func_p *slot;
 
 		if (!MATCHNAME(otdp->name, vopname))
 			continue;
 
 		/* Locate the operation's slot inside the ops vector. */
 		slot = (fs_generic_func_p *)
 		    ((char *)(vop) + otdp->offset);
 		return ((slot != NULL) && (*slot == funcp));
 	}
 
 	/* No table entry carries this operation name. */
 	return (0);
 }
 
 /*
  * fs_new_caller_id() needs to return a unique ID on a given local system.
  * The IDs do not need to survive across reboots.  These are primarily
  * used so that (FEM) monitors can detect particular callers (such as
  * the NFS server) to a given vnode/vfs operation.
  *
  * Each call atomically increments a private counter and returns the new
  * value, so the first call returns 1.
  */
 u_longlong_t
 fs_new_caller_id(void)
 {
 	static uint64_t next_caller_id = 0LL; /* First call returns 1 */
 
 	return ((u_longlong_t)atomic_inc_64_nv(&next_caller_id));
 }
 
 /*
  * The value stored in v_path is relative to rootdir, located in the global
  * zone.  Zones or chroot environments which reside deeper inside the VFS
  * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
  * what lies below their perceived root.  In order to keep v_path usable for
  * these child environments, its allocations are allowed to exceed MAXPATHLEN.
  *
  * An upper bound of max_vnode_path is placed upon v_path allocations to
  * prevent the system from going too wild at the behest of pathological
  * behavior from the operator.  The bound covers the whole allocation,
  * including the terminating NUL, and is enforced in vn_setpath_common().
  */
 size_t max_vnode_path = 4 * MAXPATHLEN;
 
 
 /*
  * Reset vp's v_path to the empty placeholder and zero its stamp, freeing
  * any previously allocated path.
  *
  * compare_stamp is a snapshot of v_path_stamp taken by the caller.  When it
  * is nonzero and no longer matches the vnode, subsequent changes have
  * occurred and the path is left alone, since the new value may be valid.
  * A compare_stamp of zero clears unconditionally.
  */
 void
 vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
 {
 	mutex_enter(&vp->v_lock);
 	if (compare_stamp == 0 || vp->v_path_stamp == compare_stamp) {
 		char *stale = vp->v_path;
 
 		vp->v_path = vn_vpath_empty;
 		vp->v_path_stamp = 0;
 		mutex_exit(&vp->v_lock);
 
 		/* Free outside the lock; the placeholder is never freed. */
 		if (stale != vn_vpath_empty) {
 			kmem_free(stale, strlen(stale) + 1);
 		}
 		return;
 	}
 	mutex_exit(&vp->v_lock);
 }
 
 /*
  * Common worker behind vn_updatepath(), vn_setpath(), vn_setpath_str() and
  * vn_renamepath(): compute and install a new v_path on vp.
  *
  *	pvp		parent directory vnode, or NULL when name is already
  *			the complete path (the vn_setpath_str() case)
  *	vp		vnode whose v_path is being set
  *	name		path component, or the complete path when pvp is NULL.
  *			When pvp is non-NULL, len + 1 bytes are copied, so
  *			name[len] is expected to be the terminating NUL.
  *	len		length of name, excluding the terminating NUL
  *	is_rename	when set, a path that already carries a stamp is
  *			replaced even if it is not older than the parent's
  *
  * A path whose allocation would exceed max_vnode_path is dropped after
  * firing a DTrace probe.  Allocations use KM_SLEEP and are performed
  * without any v_lock held; the parent's v_path_stamp is used to detect
  * changes made while the lock was dropped.
  */
 static void
 vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
     boolean_t is_rename)
 {
 	char *buf, *oldbuf;
 	hrtime_t pstamp;
 	size_t baselen, buflen = 0;
 
 	/* Handle the vn_setpath_str case. */
 	if (pvp == NULL) {
 		if (len + 1 > max_vnode_path) {
 			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
 			    vnode_t *, vp, char *, name, size_t, len + 1);
 			return;
 		}
 		buf = kmem_alloc(len + 1, KM_SLEEP);
 		bcopy(name, buf, len);
 		buf[len] = '\0';
 
 		mutex_enter(&vp->v_lock);
 		oldbuf = vp->v_path;
 		vp->v_path = buf;
 		vp->v_path_stamp = gethrtime();
 		mutex_exit(&vp->v_lock);
 		if (oldbuf != vn_vpath_empty) {
 			kmem_free(oldbuf, strlen(oldbuf) + 1);
 		}
 		return;
 	}
 
 	/* Take snapshot of parent dir */
 	mutex_enter(&pvp->v_lock);
 
 	if ((pvp->v_flag & VTRAVERSE) != 0) {
 		/*
 		 * When the parent vnode has VTRAVERSE set in its flags, normal
 		 * assumptions about v_path calculation no longer apply.  The
 		 * primary situation where this occurs is via the VFS tricks
 		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
 		 * yield meaningful results.
 		 *
 		 * When this flag is set, v_path on the child must not be
 		 * updated since the calculated value is likely to be
 		 * incorrect, given the current context.
 		 */
 		mutex_exit(&pvp->v_lock);
 		return;
 	}
 
 	/* Entered (and re-entered on retry) with pvp->v_lock held. */
 retrybuf:
 	if (pvp->v_path == vn_vpath_empty) {
 		/*
 		 * Without v_path from the parent directory, generating a child
 		 * path from the name is impossible.
 		 */
 		if (len > 0) {
 			pstamp = pvp->v_path_stamp;
 			mutex_exit(&pvp->v_lock);
 			vn_clearpath(vp, pstamp);
 			return;
 		}
 
 		/*
 		 * The only feasible case here is where a NUL lookup is being
 		 * performed on rootdir prior to its v_path being populated.
 		 */
 		ASSERT(pvp->v_path_stamp == 0);
 		baselen = 0;
 		pstamp = 0;
 	} else {
 		pstamp = pvp->v_path_stamp;
 		baselen = strlen(pvp->v_path);
 		/* ignore a trailing slash if present */
 		if (pvp->v_path[baselen - 1] == '/') {
 			/* This should only be the case for rootdir */
 			ASSERT(baselen == 1 && pvp == rootdir);
 			baselen--;
 		}
 	}
 	mutex_exit(&pvp->v_lock);
 
 	if (buflen != 0) {
 		/* Free the existing (mis-sized) buffer in case of retry */
 		kmem_free(buf, buflen);
 	}
 	/* base, '/', name and trailing NUL */
 	buflen = baselen + len + 2;
 	if (buflen > max_vnode_path) {
 		/*
 		 * NOTE(review): this probe name has a single underscore
 		 * between "setpath" and "too", unlike the probe in the
 		 * pvp == NULL case above.  It looks like a typo, but it is
 		 * left alone because renaming a probe is visible to DTrace
 		 * consumers - confirm before changing.
 		 */
 		DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
 		    vnode_t *, vp, char *, name, size_t, buflen);
 		return;
 	}
 	buf = kmem_alloc(buflen, KM_SLEEP);
 
 	mutex_enter(&pvp->v_lock);
 	if (pvp->v_path_stamp != pstamp) {
 		size_t vlen;
 
 		/*
 		 * Since v_path_stamp changed on the parent, it is likely that
 		 * v_path has been altered as well.  If the length does not
 		 * exactly match what was previously measured, the buffer
 		 * allocation must be repeated for proper sizing.
 		 */
 		if (pvp->v_path == vn_vpath_empty) {
 			/* Give up if the parent lacks a v_path */
 			mutex_exit(&pvp->v_lock);
 			kmem_free(buf, buflen);
 			return;
 		}
 		vlen = strlen(pvp->v_path);
 		if (pvp->v_path[vlen - 1] == '/') {
 			vlen--;
 		}
 		if (vlen != baselen) {
 			goto retrybuf;
 		}
 	}
 	bcopy(pvp->v_path, buf, baselen);
 	mutex_exit(&pvp->v_lock);
 
 	buf[baselen] = '/';
 	baselen++;
 	bcopy(name, &buf[baselen], len + 1);
 
 	mutex_enter(&vp->v_lock);
 	if (vp->v_path_stamp == 0) {
 		/* never-visited vnode can inherit stamp from parent */
 		ASSERT(vp->v_path == vn_vpath_empty);
 		vp->v_path_stamp = pstamp;
 		vp->v_path = buf;
 		mutex_exit(&vp->v_lock);
 	} else if (vp->v_path_stamp < pstamp || is_rename) {
 		/*
 		 * Install the updated path and stamp, ensuring that the v_path
 		 * pointer is valid at all times for dtrace.
 		 */
 		oldbuf = vp->v_path;
 		vp->v_path = buf;
 		vp->v_path_stamp = gethrtime();
 		mutex_exit(&vp->v_lock);
 		/*
 		 * As at every other free site, never hand the static
 		 * vn_vpath_empty placeholder to kmem_free().
 		 */
 		if (oldbuf != vn_vpath_empty) {
 			kmem_free(oldbuf, strlen(oldbuf) + 1);
 		}
 	} else {
 		/*
 		 * If the timestamp matches or is greater, it means another
 		 * thread performed the update first while locks were dropped
 		 * here to make the allocation.  We defer to the newer value.
 		 */
 		mutex_exit(&vp->v_lock);
 		kmem_free(buf, buflen);
 	}
 	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
 }
 
 /*
  * Refresh vp's v_path from its parent pvp and the component name, but only
  * when the parent has a path whose stamp is newer than vp's.  The parent's
  * fields are examined here without taking its v_lock.
  */
 void
 vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
 {
 	size_t namelen;
 
 	/* If the parent is empty or older, there's nothing further to do. */
 	if (pvp->v_path == vn_vpath_empty)
 		return;
 	if (pvp->v_path_stamp <= vp->v_path_stamp)
 		return;
 
 	/*
 	 * Given the lack of appropriate context, meaningful updates to v_path
 	 * cannot be made during lookups of the empty name, '.' or '..'.
 	 */
 	namelen = strlen(name);
 	switch (namelen) {
 	case 0:
 		return;
 	case 1:
 		if (name[0] == '.')
 			return;
 		break;
 	case 2:
 		if (name[0] == '.' && name[1] == '.')
 			return;
 		break;
 	default:
 		break;
 	}
 
 	vn_setpath_common(pvp, vp, name, namelen, B_FALSE);
 }
 
 /*
  * Given a starting vnode and a path, updates the path in the target vnode in
  * a safe manner.  If the vnode already has path information embedded, then the
  * cached path is left untouched.
  *
  * The rootvp argument is unused; the work is done by vn_setpath_common()
  * with is_rename set to B_FALSE.
  *
  * NOTE(review): vn_setpath_common() does replace an existing path whose
  * stamp is older than the parent's, so "left untouched" appears to hold only
  * for paths at least as new as the parent's - confirm.
  */
 /* ARGSUSED */
 void
 vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
     size_t len)
 {
 	vn_setpath_common(pvp, vp, name, len, B_FALSE);
 }
 
 /*
  * Sets the path to the vnode to be the given string, regardless of current
  * context.  The string must be a complete path from rootdir.  This is only used
  * by fsop_root() for setting the path based on the mountpoint.
  *
  * len is the length of str excluding any terminating NUL.  Implemented by
  * calling vn_setpath_common() with a NULL parent vnode.
  */
 void
 vn_setpath_str(vnode_t *vp, const char *str, size_t len)
 {
 	vn_setpath_common(NULL, vp, str, len, B_FALSE);
 }
 
 /*
  * Called from within filesystem's vop_rename() to handle renames once the
  * target vnode is available.
  *
  * Calls vn_setpath_common() with is_rename set to B_TRUE, which lets it
  * replace a path on vp even when that path's stamp is not older than the
  * parent's.
  */
 void
 vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
 {
 	vn_setpath_common(pvp, vp, name, len, B_TRUE);
 }
 
 /*
  * Similar to vn_setpath_str(), this function sets the path of the destination
  * vnode to be the same as the source vnode.
  *
  * Nothing is copied when the source has no path, when the source path
  * changes length while the buffer is being allocated, or when the
  * destination already has a path.  The destination inherits the source's
  * v_path_stamp.
  */
 void
 vn_copypath(struct vnode *src, struct vnode *dst)
 {
 	char *copy;
 	hrtime_t stamp;
 	size_t copylen;
 
 	/* Measure the source path so that a buffer can be sized. */
 	mutex_enter(&src->v_lock);
 	if (src->v_path == vn_vpath_empty) {
 		mutex_exit(&src->v_lock);
 		return;
 	}
 	copylen = strlen(src->v_path) + 1;
 	mutex_exit(&src->v_lock);
 
 	/* Allocate with no locks held. */
 	copy = kmem_alloc(copylen, KM_SLEEP);
 
 	/* Copy only if the source still has a path of the measured size. */
 	mutex_enter(&src->v_lock);
 	if (src->v_path != vn_vpath_empty &&
 	    strlen(src->v_path) + 1 == copylen) {
 		bcopy(src->v_path, copy, copylen);
 		stamp = src->v_path_stamp;
 		mutex_exit(&src->v_lock);
 	} else {
 		mutex_exit(&src->v_lock);
 		kmem_free(copy, copylen);
 		return;
 	}
 
 	/* Install on the destination unless it already has a path. */
 	mutex_enter(&dst->v_lock);
 	if (dst->v_path == vn_vpath_empty) {
 		dst->v_path = copy;
 		dst->v_path_stamp = stamp;
 		mutex_exit(&dst->v_lock);
 		return;
 	}
 	mutex_exit(&dst->v_lock);
 	kmem_free(copy, copylen);
 }
 
 
 /*
  * XXX Private interface for segvn routines that handle vnode
  * large page segments.
  *
  * return 1 if vp's file system VOP_PAGEIO() implementation
  * can be safely used instead of VOP_GETPAGE() for handling
  * pagefaults against regular non swap files. VOP_PAGEIO()
  * interface is considered safe here if its implementation
  * is very close to VOP_GETPAGE() implementation.
  * e.g. It zero's out the part of the page beyond EOF. Doesn't
  * panic if there're file holes but instead returns an error.
  * Doesn't assume file won't be changed by user writes, etc.
  *
  * return 0 otherwise.
  *
  * For now allow segvn to only use VOP_PAGEIO() with ufs and nfs.
  * The decision is made by comparing the file system type name recorded in
  * vfssw[] for vp's vfs against that allow-list.
  */
 int
 vn_vmpss_usepageio(vnode_t *vp)
 {
 	/* NULL-terminated allow-list; immutable, so build it only once. */
 	static const char *const pageio_ok_fss[] = {"ufs", "nfs", NULL};
 	vfs_t   *vfsp = vp->v_vfsp;
 	char *fsname = vfssw[vfsp->vfs_fstype].vsw_name;
 	const char *const *fsok;
 
 	if (fsname == NULL) {
 		return (0);
 	}
 
 	for (fsok = pageio_ok_fss; *fsok != NULL; fsok++) {
 		if (strcmp(*fsok, fsname) == 0) {
 			return (1);
 		}
 	}
 	return (0);
 }
 
 /* VOP_XXX() macros call the corresponding fop_xxx() function */
 
 /*
  * Open wrapper: dispatches to the vnode's vop_open entry and maintains the
  * per-vnode open counts (v_rdcnt/v_wrcnt) for regular files.
  *
  * On entry *vpp is the vnode to open; the file system may replace *vpp
  * with a different vnode.  The original vnode is held across the call so
  * that it can still be referenced afterwards.  Returns the value returned
  * by the file system's open routine.
  */
 int
 fop_open(
 	vnode_t **vpp,
 	int mode,
 	cred_t *cr,
 	caller_context_t *ct)
 {
 	int ret;
 	vnode_t *vp = *vpp;
 
 	VN_HOLD(vp);
 	/*
 	 * Adding to the vnode counts before calling open
 	 * avoids the need for a mutex. It circumvents a race
 	 * condition where a query made on the vnode counts results in a
 	 * false negative. The inquirer goes away believing the file is
 	 * not open when there is an open on the file already under way.
 	 *
 	 * The counts are meant to prevent NFS from granting a delegation
 	 * when it would be dangerous to do so.
 	 *
 	 * The vnode counts are only kept on regular files
 	 */
 	if ((*vpp)->v_type == VREG) {
 		if (mode & FREAD)
 			atomic_inc_32(&(*vpp)->v_rdcnt);
 		if (mode & FWRITE)
 			atomic_inc_32(&(*vpp)->v_wrcnt);
 	}
 
 	VOPXID_MAP_CR(vp, cr);
 
 	ret = (*(*(vpp))->v_op->vop_open)(vpp, mode, cr, ct);
 
 	if (ret) {
 		/*
 		 * Use the saved vp just in case the vnode ptr got trashed
 		 * by the error.  The counts taken above are backed out.
 		 */
 		VOPSTATS_UPDATE(vp, open);
 		if ((vp->v_type == VREG) && (mode & FREAD))
 			atomic_dec_32(&vp->v_rdcnt);
 		if ((vp->v_type == VREG) && (mode & FWRITE))
 			atomic_dec_32(&vp->v_wrcnt);
 	} else {
 		/*
 		 * Some filesystems will return a different vnode,
 		 * but the same path was still used to open it.
 		 * So if we do change the vnode and need to
 		 * copy over the path, do so here, rather than special
 		 * casing each filesystem. Adjust the vnode counts to
 		 * reflect the vnode switch.
 		 */
 		VOPSTATS_UPDATE(*vpp, open);
 		if (*vpp != vp && *vpp != NULL) {
 			vn_copypath(vp, *vpp);
 			if (((*vpp)->v_type == VREG) && (mode & FREAD))
 				atomic_inc_32(&(*vpp)->v_rdcnt);
 			if ((vp->v_type == VREG) && (mode & FREAD))
 				atomic_dec_32(&vp->v_rdcnt);
 			if (((*vpp)->v_type == VREG) && (mode & FWRITE))
 				atomic_inc_32(&(*vpp)->v_wrcnt);
 			if ((vp->v_type == VREG) && (mode & FWRITE))
 				atomic_dec_32(&vp->v_wrcnt);
 		}
 	}
 	/* Drop the hold taken on the original vnode at entry. */
 	VN_RELE(vp);
 	return (ret);
 }
 
 /*
  * Close wrapper: dispatches to the vnode's vop_close entry, updates the
  * close statistics and, for the last close of a regular file (count == 1),
  * drops the read/write open counts matching flag.  Returns the file
  * system's result.
  */
 int
 fop_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
     caller_context_t *ct)
 {
 	int error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_close(vp, flag, count, offset, cr, ct);
 	VOPSTATS_UPDATE(vp, close);
 
 	/*
 	 * Vnode counts are only kept on regular files, and the passed in
 	 * count is checked to handle possible dups.
 	 */
 	if (vp->v_type != VREG || count != 1)
 		return (error);
 
 	if (flag & FREAD) {
 		ASSERT(vp->v_rdcnt > 0);
 		atomic_dec_32(&vp->v_rdcnt);
 	}
 	if (flag & FWRITE) {
 		ASSERT(vp->v_wrcnt > 0);
 		atomic_dec_32(&vp->v_wrcnt);
 	}
 	return (error);
 }
 
 /*
  * Read wrapper: dispatches to the vnode's vop_read entry and accounts the
  * number of bytes consumed from uiop in the read statistics.  Returns the
  * file system's result.
  */
 int
 fop_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
     caller_context_t *ct)
 {
 	ssize_t	resid_before = uiop->uio_resid;
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_read(vp, uiop, ioflag, cr, ct);
 	VOPSTATS_UPDATE_IO(vp, read,
 	    read_bytes, (resid_before - uiop->uio_resid));
 	return (error);
 }
 
 /*
  * Write wrapper: dispatches to the vnode's vop_write entry and accounts the
  * number of bytes consumed from uiop in the write statistics.  Returns the
  * file system's result.
  */
 int
 fop_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
     caller_context_t *ct)
 {
 	ssize_t	resid_before = uiop->uio_resid;
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_write(vp, uiop, ioflag, cr, ct);
 	VOPSTATS_UPDATE_IO(vp, write,
 	    write_bytes, (resid_before - uiop->uio_resid));
 	return (error);
 }
 
 /*
  * Ioctl wrapper: dispatches to the vnode's vop_ioctl entry and updates the
  * ioctl statistics.  Returns the file system's result.
  */
 int
 fop_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
     int *rvalp, caller_context_t *ct)
 {
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_ioctl(vp, cmd, arg, flag, cr, rvalp, ct);
 	VOPSTATS_UPDATE(vp, ioctl);
 	return (error);
 }
 
 /*
  * Setfl wrapper: dispatches to the vnode's vop_setfl entry and updates the
  * setfl statistics.  Returns the file system's result.
  */
 int
 fop_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_setfl(vp, oflags, nflags, cr, ct);
 	VOPSTATS_UPDATE(vp, setfl);
 	return (error);
 }
 
 /*
  * Getattr wrapper: dispatches to the vnode's vop_getattr entry and updates
  * the getattr statistics.
  *
  * Returns EINVAL when ATTR_NOACLCHECK is requested of a file system without
  * VFSFT_ACEMASKONACCESS; otherwise the file system's result.  Note that
  * AT_XVATTR may already have been cleared from vap->va_mask when EINVAL is
  * returned.
  */
 int
 fop_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	/*
 	 * A file system that doesn't understand the xvattr extensions must
 	 * not see the xvattr bit.
 	 */
 	if (!vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))
 		vap->va_mask &= ~AT_XVATTR;
 
 	/*
 	 * Skipping the ACL check is only allowed iff a 32 bit ACE mask was
 	 * used with VOP_ACCESS() to determine permissions.
 	 */
 	if ((flags & ATTR_NOACLCHECK) != 0) {
 		if (!vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
 			return (EINVAL);
 	}
 
 	error = vp->v_op->vop_getattr(vp, vap, flags, cr, ct);
 	VOPSTATS_UPDATE(vp, getattr);
 	return (error);
 }
 
 /*
  * Setattr wrapper: dispatches to the vnode's vop_setattr entry and updates
  * the setattr statistics.
  *
  * Returns EINVAL when ATTR_NOACLCHECK is requested of a file system without
  * VFSFT_ACEMASKONACCESS; otherwise the file system's result.  Note that
  * AT_XVATTR may already have been cleared from vap->va_mask when EINVAL is
  * returned.
  */
 int
 fop_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	error;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	/*
 	 * A file system that doesn't understand the xvattr extensions must
 	 * not see the xvattr bit.
 	 */
 	if (!vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))
 		vap->va_mask &= ~AT_XVATTR;
 
 	/*
 	 * Skipping the ACL check is only allowed iff a 32 bit ACE mask was
 	 * used with VOP_ACCESS() to determine permissions.
 	 */
 	if ((flags & ATTR_NOACLCHECK) != 0) {
 		if (!vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
 			return (EINVAL);
 	}
 
 	error = vp->v_op->vop_setattr(vp, vap, flags, cr, ct);
 	VOPSTATS_UPDATE(vp, setattr);
 	return (error);
 }
 
 /*
  * Access wrapper: dispatches to the vnode's vop_access entry and updates
  * the access statistics.
  *
  * Returns EINVAL when V_ACE_MASK is requested of a file system without
  * VFSFT_ACEMASKONACCESS; otherwise the file system's result.
  */
 int
 fop_access(vnode_t *vp, int mode, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	error;
 
 	if ((flags & V_ACE_MASK) != 0) {
 		if (!vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
 			return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(vp, cr);
 
 	error = vp->v_op->vop_access(vp, mode, flags, cr, ct);
 	VOPSTATS_UPDATE(vp, access);
 	return (error);
 }
 
 /*
  * Lookup wrapper: resolves nm in directory dvp, either through
  * xattr_dir_lookup() (LOOKUP_XATTR without LOOKUP_HAVE_SYSATTR_DIR) or
  * through the directory's vop_lookup entry.  On success with a non-NULL
  * *vpp, the lookup statistics and the result's v_path are updated.
  *
  *	deflags		returned per-dirent flags
  *	ppnp		returned case-preserved name in directory
  *
  * Returns EINVAL when FIGNORECASE is requested of a file system that has
  * neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE; otherwise the
  * result of the lookup.
  */
 int
 fop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, int flags,
     vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *deflags,
     pathname_t *ppnp)
 {
 	int error;
 
 	/*
 	 * Fail quickly if case-insensitive access is requested but not
 	 * supported.  A vfs that supports case-insensitive lookup is
 	 * required to support extended dirent flags as well.
 	 */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	if ((flags & LOOKUP_XATTR) != 0 &&
 	    (flags & LOOKUP_HAVE_SYSATTR_DIR) == 0) {
 		error = xattr_dir_lookup(dvp, vpp, flags, cr);
 	} else {
 		error = dvp->v_op->vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
 		    cr, ct, deflags, ppnp);
 	}
 
 	if (error == 0 && *vpp != NULL) {
 		VOPSTATS_UPDATE(*vpp, lookup);
 		vn_updatepath(dvp, *vpp, nm);
 	}
 
 	return (error);
 }
 
 /*
  * Create wrapper: dispatches to the directory's vop_create entry.  On
  * success with a non-NULL *vpp, the create statistics and the new vnode's
  * v_path are updated.
  *
  *	vsecp		ACL to set during create
  *
  * Returns EINVAL when an ACL is supplied to a file system without
  * VFSFT_ACLONCREATE, or when FIGNORECASE is requested of a file system
  * that has neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE;
  * otherwise the file system's result.
  */
 int
 fop_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, int mode,
     vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
     vsecattr_t *vsecp)
 {
 	int error;
 
 	if (vsecp != NULL &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE)) {
 		return (EINVAL);
 	}
 
 	/* Fail quickly on unsupported case-insensitive access. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	error = dvp->v_op->vop_create(dvp, name, vap, excl, mode, vpp, cr,
 	    flags, ct, vsecp);
 	if (error == 0 && *vpp != NULL) {
 		VOPSTATS_UPDATE(*vpp, create);
 		vn_updatepath(dvp, *vpp, name);
 	}
 
 	return (error);
 }
 
 /*
  * Remove wrapper: dispatches to the directory's vop_remove entry and
  * updates the remove statistics.
  *
  * Returns EINVAL when FIGNORECASE is requested of a file system that has
  * neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE; otherwise the
  * file system's result.
  */
 int
 fop_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
     int flags)
 {
 	int	error;
 
 	/* Fail quickly on unsupported case-insensitive access. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	error = dvp->v_op->vop_remove(dvp, nm, cr, ct, flags);
 	VOPSTATS_UPDATE(dvp, remove);
 	return (error);
 }
 
 /*
  * Link wrapper: dispatches to the target directory's vop_link entry and
  * updates the link statistics.
  *
  * Returns EINVAL when FIGNORECASE is requested and the target file system
  * has neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE; otherwise
  * the file system's result.
  */
 int
 fop_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	int	error;
 
 	/* Fail quickly on unsupported case-insensitive access. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(tdvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(tdvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(tdvp, cr);
 
 	error = tdvp->v_op->vop_link(tdvp, svp, tnm, cr, ct, flags);
 	VOPSTATS_UPDATE(tdvp, link);
 	return (error);
 }
 
 /*
  * Rename wrapper: dispatches to the source directory's vop_rename entry
  * and updates the rename statistics on the source directory.
  *
  * Returns EINVAL when FIGNORECASE is requested and the source directory's
  * file system has neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE;
  * otherwise the file system's result.
  *
  * NOTE(review): the feature check looks only at sdvp while the credential
  * mapping is keyed on tdvp; presumably both directories share a vfs in
  * practice - confirm.
  */
 int
 fop_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	int	error;
 
 	/* Fail quickly on unsupported case-insensitive access. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(sdvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(sdvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(tdvp, cr);
 
 	error = sdvp->v_op->vop_rename(sdvp, snm, tdvp, tnm, cr, ct, flags);
 	VOPSTATS_UPDATE(sdvp, rename);
 	return (error);
 }
 
 /*
  * Mkdir wrapper: dispatches to the directory's vop_mkdir entry.  On success
  * with a non-NULL *vpp, the mkdir statistics and the new directory's v_path
  * are updated.
  *
  *	vsecp		ACL to set during create
  *
  * Returns EINVAL when an ACL is supplied to a file system without
  * VFSFT_ACLONCREATE, or when FIGNORECASE is requested of a file system
  * that has neither VFSFT_CASEINSENSITIVE nor VFSFT_NOCASESENSITIVE;
  * otherwise the file system's result.
  */
 int
 fop_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
     cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
 {
 	int error;
 
 	if (vsecp != NULL &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_ACLONCREATE)) {
 		return (EINVAL);
 	}
 
 	/* Fail quickly on unsupported case-insensitive access. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	error = dvp->v_op->vop_mkdir(dvp, dirname, vap, vpp, cr, ct, flags,
 	    vsecp);
 	if (error == 0 && *vpp != NULL) {
 		VOPSTATS_UPDATE(*vpp, mkdir);
 		vn_updatepath(dvp, *vpp, dirname);
 	}
 
 	return (error);
 }
 
 /*
  * Dispatch an rmdir request to the vop_rmdir entry of dvp's operations
  * vector.
  *
  * Returns EINVAL, without calling the file system, when FIGNORECASE is set
  * in flags but dvp's vfs reports neither VFSFT_CASEINSENSITIVE nor
  * VFSFT_NOCASESENSITIVE.  Otherwise returns the value of vop_rmdir.
  */
 int
 fop_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
     caller_context_t *ct, int flags)
 {
 	int	rv;
 
 	/* Reject case-insensitive requests this vfs cannot honor. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	rv = dvp->v_op->vop_rmdir(dvp, nm, cdir, cr, ct, flags);
 	VOPSTATS_UPDATE(dvp, rmdir);
 	return (rv);
 }
 
 /*
  * Dispatch a readdir request to the vop_readdir entry of vp's operations
  * vector.
  *
  * Returns EINVAL, without calling the file system, when V_RDDIR_ENTFLAGS
  * is set in flags but vp's vfs lacks VFSFT_DIRENTFLAGS.  Otherwise returns
  * the value of vop_readdir.  The drop in uiop->uio_resid across the call
  * is passed to the statistics macro as the byte count.
  */
 int
 fop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
     caller_context_t *ct, int flags)
 {
 	ssize_t	resid_before = uiop->uio_resid;
 	int	rv;
 
 	/* Directory entry flags need explicit vfs support. */
 	if ((flags & V_RDDIR_ENTFLAGS) != 0 &&
 	    !vfs_has_feature(vp->v_vfsp, VFSFT_DIRENTFLAGS)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_readdir(vp, uiop, cr, eofp, ct, flags);
 	VOPSTATS_UPDATE_IO(vp, readdir,
 	    readdir_bytes, (resid_before - uiop->uio_resid));
 	return (rv);
 }
 
 /*
  * Dispatch a symlink request to the vop_symlink entry of dvp's operations
  * vector.
  *
  * Returns EINVAL, without calling the file system, when FIGNORECASE is set
  * in flags but dvp's vfs reports neither VFSFT_CASEINSENSITIVE nor
  * VFSFT_NOCASESENSITIVE.  Otherwise returns the value of vop_symlink.
  *
  * When the vfs has VFSFT_REPARSE and target begins with
  * FS_REPARSE_TAG_STR, the attributes are routed through a local xvattr_t
  * prepared by fs_reparse_mark(), provided that call succeeds.
  */
 int
 fop_symlink(vnode_t *dvp, char *linkname, vattr_t *vap, char *target,
     cred_t *cr, caller_context_t *ct, int flags)
 {
 	xvattr_t reparse_xva;
 	int	rv;
 
 	/* Reject case-insensitive requests this vfs cannot honor. */
 	if ((flags & FIGNORECASE) != 0 &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_CASEINSENSITIVE) &&
 	    !vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE)) {
 		return (EINVAL);
 	}
 
 	VOPXID_MAP_CR(dvp, cr);
 
 	/* Reparse point: swap in the extended attributes if marking works. */
 	if (vfs_has_feature(dvp->v_vfsp, VFSFT_REPARSE) &&
 	    strncmp(target, FS_REPARSE_TAG_STR,
 	    strlen(FS_REPARSE_TAG_STR)) == 0 &&
 	    fs_reparse_mark(target, vap, &reparse_xva) == 0) {
 		vap = (vattr_t *)&reparse_xva;
 	}
 
 	rv = dvp->v_op->vop_symlink(dvp, linkname, vap, target, cr, ct,
 	    flags);
 	VOPSTATS_UPDATE(dvp, symlink);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_readlink entry, update the
  * readlink statistics and return the entry's result.
  */
 int
 fop_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_readlink(vp, uiop, cr, ct);
 	VOPSTATS_UPDATE(vp, readlink);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_fsync entry, update the fsync
  * statistics and return the entry's result.
  */
 int
 fop_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_fsync(vp, syncflag, cr, ct);
 	VOPSTATS_UPDATE(vp, fsync);
 	return (rv);
 }
 
 /*
  * Call vp's vop_inactive entry.  Nothing is returned.
  */
 void
 fop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	/*
 	 * The vnode may be gone once the operation returns, so the
 	 * statistics have to be recorded up front.
 	 */
 	VOPSTATS_UPDATE(vp, inactive);
 
 	VOPXID_MAP_CR(vp, cr);
 
 	vp->v_op->vop_inactive(vp, cr, ct);
 }
 
 /*
  * Call vp's vop_fid entry, update the fid statistics and return the
  * entry's result.
  */
 int
 fop_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_fid(vp, fidp, ct);
 	VOPSTATS_UPDATE(vp, fid);
 	return (rv);
 }
 
 /*
  * Call vp's vop_rwlock entry, update the rwlock statistics and return the
  * entry's result.
  */
 int
 fop_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_rwlock(vp, write_lock, ct);
 	VOPSTATS_UPDATE(vp, rwlock);
 	return (rv);
 }
 
 /*
  * Call vp's vop_rwunlock entry, then update the rwunlock statistics.
  */
 void
 fop_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
 {
 	vp->v_op->vop_rwunlock(vp, write_lock, ct);
 	VOPSTATS_UPDATE(vp, rwunlock);
 }
 
 /*
  * Call vp's vop_seek entry, update the seek statistics and return the
  * entry's result.
  */
 int
 fop_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_seek(vp, ooff, noffp, ct);
 	VOPSTATS_UPDATE(vp, seek);
 	return (rv);
 }
 
 /*
  * Call vp1's vop_cmp entry with both vnodes, update the cmp statistics
  * for vp1 and return the entry's result.
  */
 int
 fop_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp1->v_op->vop_cmp(vp1, vp2, ct);
 	VOPSTATS_UPDATE(vp1, cmp);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_frlock entry, update the
  * frlock statistics and return the entry's result.
  */
 int
 fop_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
     struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr,
 	    ct);
 	VOPSTATS_UPDATE(vp, frlock);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_space entry, update the space
  * statistics and return the entry's result.
  */
 int
 fop_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
     cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_space(vp, cmd, bfp, flag, offset, cr, ct);
 	VOPSTATS_UPDATE(vp, space);
 	return (rv);
 }
 
 /*
  * Call vp's vop_realvp entry, update the realvp statistics and return the
  * entry's result.
  */
 int
 fop_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_realvp(vp, vpp, ct);
 	VOPSTATS_UPDATE(vp, realvp);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_getpage entry, update the
  * getpage statistics and return the entry's result.
  */
 int
 fop_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
     page_t **plarr, size_t plsz, struct seg *seg, caddr_t addr,
     enum seg_rw rw, cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_getpage(vp, off, len, protp, plarr, plsz, seg,
 	    addr, rw, cr, ct);
 	VOPSTATS_UPDATE(vp, getpage);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_putpage entry, update the
  * putpage statistics and return the entry's result.
  */
 int
 fop_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_putpage(vp, off, len, flags, cr, ct);
 	VOPSTATS_UPDATE(vp, putpage);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_map entry, update the map
  * statistics and return the entry's result.
  */
 int
 fop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_map(vp, off, as, addrp, len, prot, maxprot, flags,
 	    cr, ct);
 	VOPSTATS_UPDATE(vp, map);
 	return (rv);
 }
 
 /*
  * Call vp's vop_addmap entry and, when it succeeds on a VREG vnode, add
  * btopr(len) to the vnode's mmap counters:
  *   - MAP_PRIVATE mappings bump v_mmap_read once;
  *   - otherwise v_mmap_write is bumped for PROT_WRITE in maxprot, and
  *     v_mmap_read is bumped once for PROT_READ and once more for
  *     PROT_EXEC.
  * Returns the result of vop_addmap.
  */
 int
 fop_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_addmap(vp, off, as, addr, len, prot, maxprot,
 	    flags, cr, ct);
 
 	if (rv == 0 && vp->v_type == VREG) {
 		int64_t npages = (int64_t)(u_longlong_t)btopr(len);
 		uint64_t *rdcnt = (uint64_t *)&vp->v_mmap_read;
 		uint64_t *wrcnt = (uint64_t *)&vp->v_mmap_write;
 
 		if ((flags & MAP_PRIVATE) != 0) {
 			/*
 			 * A private mapping is never written back, even if
 			 * the file is open for write, so count it as a read.
 			 */
 			atomic_add_64(rdcnt, npages);
 		} else {
 			/*
 			 * atomic_add_64 keeps the 64 bit update atomic on
 			 * 32 bit machines.
 			 */
 			if ((maxprot & PROT_WRITE) != 0)
 				atomic_add_64(wrcnt, npages);
 			if ((maxprot & PROT_READ) != 0)
 				atomic_add_64(rdcnt, npages);
 			if ((maxprot & PROT_EXEC) != 0)
 				atomic_add_64(rdcnt, npages);
 		}
 	}
 	VOPSTATS_UPDATE(vp, addmap);
 	return (rv);
 }
 
 /*
  * Call vp's vop_delmap entry and, unless it returned EAGAIN, subtract
  * btopr(len) from the mmap counters of a VREG vnode, mirroring the
  * accounting done when the mapping was added:
  *   - MAP_PRIVATE mappings decrement v_mmap_read once;
  *   - otherwise v_mmap_write is decremented for PROT_WRITE in maxprot,
  *     and v_mmap_read once for PROT_READ and once more for PROT_EXEC.
  * Returns the result of vop_delmap.
  */
 int
 fop_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
     caller_context_t *ct)
 {
 	int rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_delmap(vp, off, as, addr, len, prot, maxprot,
 	    flags, cr, ct);
 
 	/*
 	 * NFS invokes delmap twice: the first call only sets up a callback
 	 * and returns EAGAIN, the second does the real work.  The counters
 	 * must therefore be left alone on EAGAIN and adjusted only on the
 	 * second request.
 	 */
 	if (rv != EAGAIN && vp->v_type == VREG) {
 		int64_t npages = (int64_t)(-(u_longlong_t)btopr(len));
 		uint64_t *rdcnt = (uint64_t *)&vp->v_mmap_read;
 		uint64_t *wrcnt = (uint64_t *)&vp->v_mmap_write;
 
 		if ((flags & MAP_PRIVATE) != 0) {
 			atomic_add_64(rdcnt, npages);
 		} else {
 			/*
 			 * atomic_add_64 keeps the 64 bit update atomic on
 			 * 32 bit machines.
 			 */
 			if ((maxprot & PROT_WRITE) != 0)
 				atomic_add_64(wrcnt, npages);
 			if ((maxprot & PROT_READ) != 0)
 				atomic_add_64(rdcnt, npages);
 			if ((maxprot & PROT_EXEC) != 0)
 				atomic_add_64(rdcnt, npages);
 		}
 	}
 	VOPSTATS_UPDATE(vp, delmap);
 	return (rv);
 }
 
 
 /*
  * Call vp's vop_poll entry, update the poll statistics and return the
  * entry's result.
  */
 int
 fop_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
     struct pollhead **phpp, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_poll(vp, events, anyyet, reventsp, phpp, ct);
 	VOPSTATS_UPDATE(vp, poll);
 	return (rv);
 }
 
 /*
  * Call vp's vop_dump entry, update the dump statistics and return the
  * entry's result.
  *
  * Returns EIO, without calling the file system, when lbdn does not survive
  * a round trip through daddr_t or dblks does not survive a round trip
  * through int.
  */
 int
 fop_dump(vnode_t *vp, caddr_t addr, offset_t lbdn, offset_t dblks,
     caller_context_t *ct)
 {
 	int	rv;
 
 	/* both values must be safe to hand on to bdev_dump */
 	if ((daddr_t)lbdn != lbdn)
 		return (EIO);
 	if ((int)dblks != dblks)
 		return (EIO);
 
 	rv = vp->v_op->vop_dump(vp, addr, lbdn, dblks, ct);
 	VOPSTATS_UPDATE(vp, dump);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_pathconf entry, update the
  * pathconf statistics and return the entry's result.
  */
 int
 fop_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_pathconf(vp, cmd, valp, cr, ct);
 	VOPSTATS_UPDATE(vp, pathconf);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_pageio entry, update the
  * pageio statistics and return the entry's result.
  */
 int
 fop_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len,
     int flags, cred_t *cr, caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_pageio(vp, pp, io_off, io_len, flags, cr, ct);
 	VOPSTATS_UPDATE(vp, pageio);
 	return (rv);
 }
 
 /*
  * Call vp's vop_dumpctl entry, update the dumpctl statistics and return
  * the entry's result.
  */
 int
 fop_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_dumpctl(vp, action, blkp, ct);
 	VOPSTATS_UPDATE(vp, dumpctl);
 	return (rv);
 }
 
 /*
  * Call vp's vop_dispose entry.  Nothing is returned.
  */
 void
 fop_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr,
     caller_context_t *ct)
 {
 	/*
 	 * The vnode can be lost during the operation, so the statistics
 	 * are recorded before it is invoked.
 	 */
 	VOPSTATS_UPDATE(vp, dispose);
 
 	VOPXID_MAP_CR(vp, cr);
 
 	vp->v_op->vop_dispose(vp, pp, flag, dn, cr, ct);
 }
 
 /*
  * Map the credential for vp, call vp's vop_setsecattr entry, update the
  * setsecattr statistics and return the entry's result.
  *
  * Returns EINVAL, without calling the file system, when ATTR_NOACLCHECK is
  * set in flag but vp's vfs lacks VFSFT_ACEMASKONACCESS.
  */
 int
 fop_setsecattr(vnode_t *vp, vsecattr_t *vsap, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	/*
 	 * Skipping the ACL check is permitted only if a 32 bit ACE mask
 	 * was used with VOP_ACCESS() to determine permissions.
 	 */
 	if ((flag & ATTR_NOACLCHECK) != 0 &&
 	    !vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
 		return (EINVAL);
 
 	rv = vp->v_op->vop_setsecattr(vp, vsap, flag, cr, ct);
 	VOPSTATS_UPDATE(vp, setsecattr);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_getsecattr entry, update the
  * getsecattr statistics and return the entry's result.
  *
  * Returns EINVAL, without calling the file system, when ATTR_NOACLCHECK is
  * set in flag but vp's vfs lacks VFSFT_ACEMASKONACCESS.
  */
 int
 fop_getsecattr(vnode_t *vp, vsecattr_t *vsap, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	/*
 	 * Skipping the ACL check is permitted only if a 32 bit ACE mask
 	 * was used with VOP_ACCESS() to determine permissions.
 	 */
 	if ((flag & ATTR_NOACLCHECK) != 0 &&
 	    !vfs_has_feature(vp->v_vfsp, VFSFT_ACEMASKONACCESS))
 		return (EINVAL);
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_getsecattr(vp, vsap, flag, cr, ct);
 	VOPSTATS_UPDATE(vp, getsecattr);
 	return (rv);
 }
 
 /*
  * Map the credential for vp, call vp's vop_shrlock entry, update the
  * shrlock statistics and return the entry's result.
  */
 int
 fop_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
     caller_context_t *ct)
 {
 	int	rv;
 
 	VOPXID_MAP_CR(vp, cr);
 
 	rv = vp->v_op->vop_shrlock(vp, cmd, shr, flag, cr, ct);
 	VOPSTATS_UPDATE(vp, shrlock);
 	return (rv);
 }
 
 /*
  * Call vp's vop_vnevent entry, update the vnevent statistics and return
  * the entry's result.
  */
 int
 fop_vnevent(vnode_t *vp, vnevent_t vnevent, vnode_t *dvp, char *fnm,
     caller_context_t *ct)
 {
 	int	rv;
 
 	rv = vp->v_op->vop_vnevent(vp, vnevent, dvp, fnm, ct);
 	VOPSTATS_UPDATE(vp, vnevent);
 	return (rv);
 }
 
 /*
  * Call vp's vop_reqzcbuf entry, update the reqzcbuf statistics and return
  * the entry's result.
  *
  * Returns ENOTSUP, without calling the file system, when vp's vfs lacks
  * VFSFT_ZEROCOPY_SUPPORTED.
  */
 int
 fop_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *uiop, cred_t *cr,
     caller_context_t *ct)
 {
 	int rv;
 
 	if (!vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED))
 		return (ENOTSUP);
 
 	rv = vp->v_op->vop_reqzcbuf(vp, ioflag, uiop, cr, ct);
 	VOPSTATS_UPDATE(vp, reqzcbuf);
 	return (rv);
 }
 
 /*
  * Call vp's vop_retzcbuf entry, update the retzcbuf statistics and return
  * the entry's result.
  *
  * Returns ENOTSUP, without calling the file system, when vp's vfs lacks
  * VFSFT_ZEROCOPY_SUPPORTED.
  */
 int
 fop_retzcbuf(vnode_t *vp, xuio_t *uiop, cred_t *cr, caller_context_t *ct)
 {
 	int rv;
 
 	if (!vfs_has_feature(vp->v_vfsp, VFSFT_ZEROCOPY_SUPPORTED))
 		return (ENOTSUP);
 
 	rv = vp->v_op->vop_retzcbuf(vp, uiop, cr, ct);
 	VOPSTATS_UPDATE(vp, retzcbuf);
 	return (rv);
 }
 
 /*
  * Destructor that does nothing.
  *	A NULL destructor marks a key as unused, so keys created without a
  *	destructor of their own are given this one instead.
  */
 /* ARGSUSED */
 void
 vsd_defaultdestructor(void *value)
 {
 	/* intentionally empty */
 }
 
 /*
  * Allocate a key (a 1-based index into the per vnode value array) and
  * store it in *keyp.  If *keyp is already non-zero nothing is changed.
  *	A NULL destructor is replaced by vsd_defaultdestructor.
  *	Runs entirely under vsd_lock, which locks out vsd_create,
  *	vsd_destroy, and vsd_free
  *	May allocate memory with lock held
  */
 void
 vsd_create(uint_t *keyp, void (*destructor)(void *))
 {
 	int	slot;
 	uint_t	grown;
 
 	mutex_enter(&vsd_lock);
 
 	/* a key that is already allocated is left alone */
 	if (*keyp != 0)
 		goto done;
 
 	if (destructor == NULL)
 		destructor = vsd_defaultdestructor;
 
 	/* scan for the first unused slot (NULL destructor) */
 	slot = 0;
 	while (slot < vsd_nkeys && vsd_destructor[slot] != NULL)
 		++slot;
 
 	/*
 	 * every slot is taken: double the destructor array (or start it
 	 * at one entry)
 	 */
 	if (slot == vsd_nkeys) {
 		grown = vsd_nkeys << 1;
 		if (grown == 0)
 			grown = 1;
 		vsd_destructor =
 		    (void (**)(void *))vsd_realloc((void *)vsd_destructor,
 		    (size_t)(vsd_nkeys * sizeof (void (*)(void *))),
 		    (size_t)(grown * sizeof (void (*)(void *))));
 		vsd_nkeys = grown;
 	}
 
 	/* claim the slot; keys are the slot index plus one */
 	vsd_destructor[slot] = destructor;
 	*keyp = slot + 1;
 
 	/* the list of VSD nodes is created on first use */
 	if (vsd_list == NULL) {
 		vsd_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 		list_create(vsd_list, sizeof (struct vsd_node),
 		    offsetof(struct vsd_node, vs_nodes));
 	}
 
 done:
 	mutex_exit(&vsd_lock);
 }
 
 /*
  * Release a key: *keyp is reset to 0 and, if it held a valid key, the
  * key's destructor is run on every non-NULL value stored under it before
  * the slot is marked unused.
  *
  * Assumes that the caller is preventing vsd_set and vsd_get
  * Locks out vsd_create, vsd_destroy, and vsd_free
  * May free memory with lock held
  */
 void
 vsd_destroy(uint_t *keyp)
 {
 	uint_t key;
 	uint_t slot;
 	struct vsd_node *node;
 
 	/* the key namespace and the destructor lists need vsd_lock */
 	mutex_enter(&vsd_lock);
 	key = *keyp;
 	*keyp = 0;
 
 	ASSERT(key <= vsd_nkeys);
 
 	/* key 0 was never allocated, so there is nothing to tear down */
 	if (key == 0) {
 		mutex_exit(&vsd_lock);
 		return;
 	}
 
 	slot = key - 1;
 
 	/* visit every vnode that has VSD */
 	node = list_head(vsd_list);
 	while (node != NULL) {
 		/* nodes too short to hold this key are skipped */
 		if (key <= node->vs_nkeys) {
 			if (node->vs_value[slot] && vsd_destructor[slot])
 				(*vsd_destructor[slot])(node->vs_value[slot]);
 			node->vs_value[slot] = NULL;
 		}
 		node = list_next(vsd_list, node);
 	}
 
 	/* a NULL destructor is what marks the slot as unused */
 	vsd_destructor[slot] = NULL;
 
 	mutex_exit(&vsd_lock);
 }
 
 /*
  * Look up the per vnode value stored under key.  Returns NULL when key is
  * 0, when the vnode has no VSD, or when key lies beyond the vnode's value
  * array.
  * Assumes the caller is protecting key from vsd_create and vsd_destroy
  * Assumes the caller is holding v_vsd_lock to protect the vsd.
  */
 void *
 vsd_get(vnode_t *vp, uint_t key)
 {
 	struct vsd_node *node;
 
 	ASSERT(vp != NULL);
 	ASSERT(mutex_owned(&vp->v_vsd_lock));
 
 	node = vp->v_vsd;
 
 	if (key == 0 || node == NULL || key > node->vs_nkeys)
 		return (NULL);
 
 	return (node->vs_value[key - 1]);
 }
 
 /*
  * Store value under key for this vnode, creating the vnode's VSD node and
  * growing its value array as required.  Returns EINVAL for key 0 and 0
  * otherwise.
  * Assumes the caller is holding v_vsd_lock to protect the vsd.
  */
 int
 vsd_set(vnode_t *vp, uint_t key, void *value)
 {
 	struct vsd_node *node;
 
 	ASSERT(vp != NULL);
 	ASSERT(mutex_owned(&vp->v_vsd_lock));
 
 	if (key == 0)
 		return (EINVAL);
 
 	node = vp->v_vsd;
 	if (node == NULL) {
 		node = kmem_zalloc(sizeof (*node), KM_SLEEP);
 		vp->v_vsd = node;
 	}
 
 	/*
 	 * A freshly allocated node has vs_nkeys == 0, so it always takes
 	 * the growth path below.  When the slot already exists the value is
 	 * simply overwritten; releasing whatever was stored there before is
 	 * the caller's job.
 	 */
 	if (key > node->vs_nkeys) {
 		ASSERT(key <= vsd_nkeys);
 
 		if (node->vs_nkeys == 0) {
 			/*
 			 * First value for this vnode: link the node onto
 			 * the list of all VSD nodes, with vsd_destroy()
 			 * locked out.
 			 */
 			mutex_enter(&vsd_lock);
 			list_insert_head(vsd_list, node);
 			mutex_exit(&vsd_lock);
 		}
 
 		/* grow the value array to exactly key entries */
 		node->vs_value = vsd_realloc(node->vs_value,
 		    node->vs_nkeys * sizeof (void *),
 		    key * sizeof (void *));
 		node->vs_nkeys = key;
 	}
 
 	node->vs_value[key - 1] = value;
 	return (0);
 }
 
 /*
  * Called from vn_free() to tear down a vnode's VSD: run the destructor for
  * every stored value, unlink the node from the global list and free it.
  *	Locks out vsd_create and vsd_destroy
  *	Assumes that the destructor *DOES NOT* use vsd
  */
 void
 vsd_free(vnode_t *vp)
 {
 	int slot;
 	struct vsd_node *node = vp->v_vsd;
 
 	if (node == NULL)
 		return;
 
 	/*
 	 * A node with vs_nkeys == 0 has no value array and needs none of
 	 * the locked work below; only the node itself is freed.
 	 */
 	if (node->vs_nkeys != 0) {
 		/*
 		 * With vsd_create and vsd_destroy locked out, destroy each
 		 * value and clear its slot.
 		 */
 		mutex_enter(&vsd_lock);
 
 		for (slot = 0; slot < node->vs_nkeys; slot++) {
 			if (node->vs_value[slot] && vsd_destructor[slot])
 				(*vsd_destructor[slot])(node->vs_value[slot]);
 			node->vs_value[slot] = NULL;
 		}
 
 		/* take the node off the linked list of VSD nodes */
 		list_remove(vsd_list, node);
 
 		mutex_exit(&vsd_lock);
 
 		kmem_free(node->vs_value, node->vs_nkeys * sizeof (void *));
 	}
 
 	kmem_free(node, sizeof (struct vsd_node));
 	vp->v_vsd = NULL;
 }
 
 /*
  * Allocate a zeroed buffer of nsize bytes.  If old is non-NULL, copy osize
  * bytes from it into the new buffer and free it.  Returns the new buffer.
  */
 static void *
 vsd_realloc(void *old, size_t osize, size_t nsize)
 {
 	void *fresh = kmem_zalloc(nsize, KM_SLEEP);
 
 	if (old == NULL)
 		return (fresh);
 
 	bcopy(old, fresh, osize);
 	kmem_free(old, osize);
 	return (fresh);
 }
 
 /*
  * Prepare the extensible system attribute used to create a reparse point.
  * The symlink data 'target' is checked with reparse_validate(); on success
  * *xvattr is initialized, seeded with a copy of *vap, and marked with
  * AT_XVATTR and a request to set XAT_REPARSE.
  *
  * return 0 if ok else -1 (NULL argument or invalid reparse string).
  */
 static int
 fs_reparse_mark(char *target, vattr_t *vap, xvattr_t *xvattr)
 {
 	xoptattr_t *optattr;
 
 	if (target == NULL || vap == NULL || xvattr == NULL)
 		return (-1);
 
 	/* the reparse string has to be well formed */
 	if (reparse_validate((const char *)target) != 0)
 		return (-1);
 
 	xva_init(xvattr);
 	xvattr->xva_vattr = *vap;
 	xvattr->xva_vattr.va_mask |= AT_XVATTR;
 
 	optattr = xva_getxoptattr(xvattr);
 	ASSERT(optattr);
 	XVA_SET_REQ(xvattr, XAT_REPARSE);
 	optattr->xoa_reparse = 1;
 
 	return (0);
 }
 
 /*
  * Report whether a symlink is a reparse point.
  * Returns B_TRUE only when vp is a VLNK on a vfs with VFSFT_XVATTR, the
  * attribute fetch succeeds, XAT_REPARSE was returned, and its value is
  * non-zero; returns B_FALSE in every other case.
  */
 boolean_t
 vn_is_reparse(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	xvattr_t xva;
 	xoptattr_t *optattr;
 
 	if (vp->v_type != VLNK)
 		return (B_FALSE);
 	if (!vfs_has_feature(vp->v_vfsp, VFSFT_XVATTR))
 		return (B_FALSE);
 
 	xva_init(&xva);
 	optattr = xva_getxoptattr(&xva);
 	ASSERT(optattr);
 	XVA_SET_REQ(&xva, XAT_REPARSE);
 
 	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, ct) != 0)
 		return (B_FALSE);
 
 	/* the file system must actually have returned XAT_REPARSE */
 	if ((xva.xva_vattr.va_mask & AT_XVATTR) == 0 ||
 	    !XVA_ISSET_RTN(&xva, XAT_REPARSE))
 		return (B_FALSE);
 
 	if (optattr->xoa_reparse)
 		return (B_TRUE);
 	return (B_FALSE);
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c	(revision 354953)
@@ -1,705 +1,705 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dnode.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/dmu_impl.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/callb.h>
 #include <sys/zfeature.h>
 
 /*
  * NOTE(review): presumably the upper bound, in bytes, on data prefetched
  * ahead of the traversal; its consumer is not in this part of the file --
  * confirm.
  */
 int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
 /*
  * When B_FALSE, traverse_visitbp() may skip holes whose birth txg is 0
  * (subject to the further conditions checked there).  When B_TRUE, that
  * shortcut is never taken and such holes are always visited.
  */
 boolean_t send_holes_without_birth_time = B_TRUE;
 
 /*
  * State shared between a traversal and its data prefetcher.
  */
 typedef struct prefetch_data {
 	/* held by traverse_visitbp() around its use of pd_bytes_fetched */
 	kmutex_t pd_mtx;
 	/* waited on and broadcast by traverse_visitbp() under pd_mtx */
 	kcondvar_t pd_cv;
 	/*
 	 * traverse_visitbp() waits until this reaches the block's logical
 	 * size (or pd_exited is set), then subtracts that size.
 	 */
 	int32_t pd_bytes_fetched;
 	/* prefetch_needed() asserts TRAVERSE_PREFETCH_DATA is set here */
 	int pd_flags;
 	/* NOTE(review): presumably asks the prefetcher to stop -- confirm */
 	boolean_t pd_cancel;
 	/* once set, traverse_visitbp() no longer waits on the prefetcher */
 	boolean_t pd_exited;
 	/* NOTE(review): presumably the prefetcher's resume point -- confirm */
 	zbookmark_phys_t pd_resume;
 } prefetch_data_t;
 
 /*
  * Per-traversal state threaded through the traverse_*() functions.
  */
 typedef struct traverse_data {
 	/* pool handle; passed to td_func and to arc_read() */
 	spa_t *td_spa;
 	/* objset id placed in the bookmarks built for ZIL blocks */
 	uint64_t td_objset;
 	/* NOTE(review): presumably the bp the traversal starts at -- confirm */
 	blkptr_t *td_rootbp;
 	/* blocks with a non-zero birth txg <= this value are not visited */
 	uint64_t td_min_txg;
 	/*
 	 * Optional resume bookmark: consulted by resume_skip_check(), zeroed
 	 * once the resume point is reached, and rewritten by
 	 * traverse_visitbp() when the traversal stops with an error.
 	 */
 	zbookmark_phys_t *td_resume;
 	/* TRAVERSE_* flags (PRE, POST, HARD, PREFETCH_METADATA, ...) */
 	int td_flags;
 	/* prefetch state to synchronize with, or NULL */
 	prefetch_data_t *td_pfd;
 	/* set once td_resume has been recorded, so it is written only once */
 	boolean_t td_paused;
 	/* compared with td_min_txg when deciding whether to skip old holes */
 	uint64_t td_hole_birth_enabled_txg;
 	/* callback invoked for each visited block / dnode */
 	blkptr_cb_t *td_func;
 	/* opaque argument handed to td_func */
 	void *td_arg;
 	/* cleared when the objset's meta-dnode has dn_maxblkid == 0 */
 	boolean_t td_realloc_possible;
 } traverse_data_t;
 
 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object);
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
     uint64_t objset, uint64_t object);
 
 /*
  * Block callback handed to zil_parse().  Holes are ignored.  When
  * claim_txg is 0 and the block was born at or after
  * spa_min_claim_txg(), -1 is returned without visiting it.  Otherwise the
  * traversal callback is invoked on the block with a ZIL bookmark (its
  * return value is discarded) and 0 is returned.
  */
 static int
 traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	traverse_data_t *tdata = arg;
 	spa_t *spa = tdata->td_spa;
 	zbookmark_phys_t bookmark;
 
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(spa))
 		return (-1);
 
 	SET_BOOKMARK(&bookmark, tdata->td_objset, ZB_ZIL_OBJECT,
 	    ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	(void) tdata->td_func(spa, zilog, bp, &bookmark, NULL,
 	    tdata->td_arg);
 
 	return (0);
 }
 
 /*
  * Record callback handed to zil_parse().  Only TX_WRITE records are of
  * interest: the traversal callback is invoked on the record's block
  * pointer unless it is a hole, claim_txg is 0, or the block was born
  * before claim_txg.  Always returns 0; the callback's return value is
  * discarded.
  */
 static int
 traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
 {
 	traverse_data_t *tdata = arg;
 	lr_write_t *wr;
 	blkptr_t *wbp;
 	zbookmark_phys_t bookmark;
 
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	wr = (lr_write_t *)lrc;
 	wbp = &wr->lr_blkptr;
 
 	if (BP_IS_HOLE(wbp) || claim_txg == 0 || wbp->blk_birth < claim_txg)
 		return (0);
 
 	SET_BOOKMARK(&bookmark, tdata->td_objset, wr->lr_foid,
 	    ZB_ZIL_LEVEL, wr->lr_offset / BP_GET_LSIZE(wbp));
 
 	(void) tdata->td_func(tdata->td_spa, zilog, wbp, &bookmark, NULL,
 	    tdata->td_arg);
 
 	return (0);
 }
 
 /*
  * Walk the intent log described by zh, feeding its blocks and records to
  * traverse_zil_block() and traverse_zil_record().
  */
 static void
 traverse_zil(traverse_data_t *td, zil_header_t *zh)
 {
 	uint64_t claimed = zh->zh_claim_txg;
 	zilog_t *zilog;
 
 	/*
 	 * Only blocks that were claimed but not yet replayed are of
 	 * interest, plus blocks that are already stable because the pool
 	 * is read-only.  An unclaimed log on a writeable pool is skipped.
 	 */
 	if (claimed == 0 && spa_writeable(td->td_spa))
 		return;
 
 	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
 	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
 	    claimed);
 	zil_free(zilog);
 }
 
 /*
  * Result of resume_skip_check(): how much of a block's subtree a resumed
  * traversal may skip.
  */
 typedef enum resume_skip {
 	RESUME_SKIP_ALL,	/* do not visit the block at all */
 	RESUME_SKIP_NONE,	/* visit the block and its children */
 	RESUME_SKIP_CHILDREN	/* visit the block but not its children */
 } resume_skip_t;
 
 /*
  * Decide how a resumed traversal should treat the block identified by zb.
  *
  * RESUME_SKIP_ALL: td has a pending resume point and this block, with
  * everything below it, was already visited, so it can be skipped entirely.
  * RESUME_SKIP_CHILDREN: this block is the resume point of a post-order
  * traversal; the block itself is visited but its children are not (they
  * must have been visited in a previous traversal).
  * RESUME_SKIP_NONE: anything else, including traversals that are not
  * resuming or that have already reached their resume point.
  */
 static resume_skip_t
 resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
     const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t *resume = td->td_resume;
 
 	/* no resume point pending */
 	if (resume == NULL || ZB_IS_ZERO(resume))
 		return (RESUME_SKIP_NONE);
 
 	/* this bp and everything below it were covered last time */
 	if (zbookmark_subtree_completed(dnp, zb, resume))
 		return (RESUME_SKIP_ALL);
 
 	if (bcmp(zb, resume, sizeof (*zb)) != 0)
 		return (RESUME_SKIP_NONE);
 
 	/*
 	 * This is the block we are resuming from: zero the bookmark to
 	 * record that the traversal has resumed.
 	 */
 	bzero(resume, sizeof (*zb));
 	if (td->td_flags & TRAVERSE_POST)
 		return (RESUME_SKIP_CHILDREN);
 	return (RESUME_SKIP_NONE);
 }
 
 /*
  * Issue a no-wait prefetch read of bp through arc_read(), provided that
  * TRAVERSE_PREFETCH_METADATA is set, no resume is pending, bp is neither
  * a hole nor born at or before td_min_txg, and bp is either an indirect
  * block or a DMU_OT_DNODE block.  The result of arc_read() is ignored.
  */
 static void
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	if ((td->td_flags & TRAVERSE_PREFETCH_METADATA) == 0)
 		return;
 
 	/*
 	 * While a resume is still pending, prefetching is pointless: some
 	 * children will not be needed (and may in fact already have been
 	 * freed).
 	 */
 	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
 		return;
 
 	if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
 		return;
 
 	/* level-0 blocks are only worth prefetching if they hold dnodes */
 	if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
 		return;
 
 	(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 }
 
 /*
  * Returns B_FALSE for holes, embedded block pointers and intent log
  * blocks, and B_TRUE for every other block pointer.
  */
 static boolean_t
 prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
 {
 	ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
 
 	if (BP_IS_HOLE(bp))
 		return (B_FALSE);
 	if (BP_IS_EMBEDDED(bp))
 		return (B_FALSE);
 	if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Visit the block pointer bp, identified by bookmark zb, and recursively
  * everything reachable below it.
  *
  * td_func is invoked on bp before its children when TRAVERSE_PRE is set
  * and after them when TRAVERSE_POST is set; holes get a single callback.
  * A pre-order callback returning TRAVERSE_VISIT_NO_CHILDREN ends the
  * visit of this subtree with a return value of 0.
  *
  * Returns 0 on success, otherwise the first non-zero value produced by
  * td_func, arc_read() or a recursive visit.  With TRAVERSE_HARD set, EIO
  * and ECKSUM are converted to 0 so that the traversal continues.  When a
  * non-zero error is returned and td_resume is set, the position is
  * recorded in td_resume (once per traversal, guarded by td_paused).
  */
 static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
 	zbookmark_phys_t czb;
 	int err = 0;
 	arc_buf_t *buf = NULL;
 	prefetch_data_t *pd = td->td_pfd;
 	boolean_t hard = td->td_flags & TRAVERSE_HARD;
 
 	switch (resume_skip_check(td, dnp, zb)) {
 	case RESUME_SKIP_ALL:
 		return (0);
 	case RESUME_SKIP_CHILDREN:
 		goto post;
 	case RESUME_SKIP_NONE:
 		break;
 	default:
 		ASSERT(0);
 	}
 
 	if (bp->blk_birth == 0) {
 		/*
 		 * Since this block has a birth time of 0 it must be one of
 		 * two things: a hole created before the
 		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
 		 * which has always been a hole in an object.
 		 *
 		 * If a file is written sparsely, then the unwritten parts of
 		 * the file were "always holes" -- that is, they have been
 		 * holes since this object was allocated.  However, we (and
 		 * our callers) can not necessarily tell when an object was
 		 * allocated.  Therefore, if it's possible that this object
 		 * was freed and then its object number reused, we need to
 		 * visit all the holes with birth==0.
 		 *
 		 * If it isn't possible that the object number was reused,
 		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
 		 * all the blocks we will visit as part of this traversal,
 		 * then this hole must have always existed, so we can skip
 		 * it.  We visit blocks born after (exclusive) td_min_txg.
 		 *
 		 * Note that the meta-dnode cannot be reallocated.
 		 */
 		if (!send_holes_without_birth_time &&
 		    (!td->td_realloc_possible ||
 		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
 		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
 			return (0);
 	} else if (bp->blk_birth <= td->td_min_txg) {
 		return (0);
 	}
 
 	/*
 	 * Throttle against the prefetcher: wait until it has fetched at
 	 * least this block's logical size (or has exited), then consume
 	 * that many bytes and wake any waiters.
 	 */
 	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
 		uint64_t size = BP_GET_LSIZE(bp);
 		mutex_enter(&pd->pd_mtx);
 		ASSERT(pd->pd_bytes_fetched >= 0);
 		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
 			cv_wait(&pd->pd_cv, &pd->pd_mtx);
 		pd->pd_bytes_fetched -= size;
 		cv_broadcast(&pd->pd_cv);
 		mutex_exit(&pd->pd_mtx);
 	}
 
 	/* Holes have no children: one callback, regardless of PRE/POST. */
 	if (BP_IS_HOLE(bp)) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 		if (err != 0)
 			goto post;
 		return (0);
 	}
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			goto post;
 	}
 
 	if (BP_GET_LEVEL(bp) > 0) {
 		/* Indirect block: read it and descend into each child bp. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		blkptr_t *cbp;
 		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 		cbp = buf->b_data;
 
 		/* start prefetching all children before visiting any */
 		for (i = 0; i < epb; i++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			traverse_prefetch_metadata(td, &cbp[i], &czb);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i++) {
 			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
 			    zb->zb_level - 1,
 			    zb->zb_blkid * epb + i);
 			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
 		/* Level-0 dnode block: traverse each dnode it contains. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 		int i;
 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 		dnode_phys_t *child_dnp = buf->b_data;
 
 		/* step over the extra slots occupied by each dnode */
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			prefetch_dnode_metadata(td, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 		}
 
 		/* recursively visitbp() blocks below this */
 		for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
 			err = traverse_dnode(td, &child_dnp[i],
 			    zb->zb_objset, zb->zb_blkid * epb + i);
 			if (err != 0)
 				break;
 		}
 	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
 		/* Objset block: traverse its meta-dnode and accounting dnodes. */
 		arc_flags_t flags = ARC_FLAG_WAIT;
 
 		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 		if (err != 0)
 			goto post;
 
 		objset_phys_t *osp = buf->b_data;
 		prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		/*
 		 * See the block comment above for the goal of this variable.
 		 * If the maxblkid of the meta-dnode is 0, then we know that
 		 * we've never had more than DNODES_PER_BLOCK objects in the
 		 * dataset, which means we can't have reused any object ids.
 		 */
 		if (osp->os_meta_dnode.dn_maxblkid == 0)
 			td->td_realloc_possible = B_FALSE;
 
 		/*
 		 * The groupused/userused dnodes are only touched when the
 		 * buffer is at least sizeof (objset_phys_t) bytes long.
 		 */
 		if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
 			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 			prefetch_dnode_metadata(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 
 		err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
 		    DMU_META_DNODE_OBJECT);
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			err = traverse_dnode(td, &osp->os_groupused_dnode,
 			    zb->zb_objset, DMU_GROUPUSED_OBJECT);
 		}
 		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
 			err = traverse_dnode(td, &osp->os_userused_dnode,
 			    zb->zb_objset, DMU_USERUSED_OBJECT);
 		}
 	}
 
 	/* buf is still NULL for level-0 blocks of any other type */
 	if (buf)
 		arc_buf_destroy(buf, &buf);
 
 post:
 	if (err == 0 && (td->td_flags & TRAVERSE_POST))
 		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
 
 	if (hard && (err == EIO || err == ECKSUM)) {
 		/*
 		 * Ignore this disk error as requested by the HARD flag,
 		 * and continue traversal.
 		 */
 		err = 0;
 	}
 
 	/*
 	 * If we are stopping here, set td_resume.
 	 */
 	if (td->td_resume != NULL && err != 0 && !td->td_paused) {
 		td->td_resume->zb_objset = zb->zb_objset;
 		td->td_resume->zb_object = zb->zb_object;
 		td->td_resume->zb_level = 0;
 		/*
 		 * If we have stopped on an indirect block (e.g. due to
 		 * i/o error), we have not visited anything below it.
 		 * Set the bookmark to the first level-0 block that we need
 		 * to visit.  This way, the resuming code does not need to
 		 * deal with resuming from indirect blocks.
 		 *
 		 * Note, if zb_level <= 0, dnp may be NULL, so we don't want
 		 * to dereference it.
 		 */
 		td->td_resume->zb_blkid = zb->zb_blkid;
 		if (zb->zb_level > 0) {
 			td->td_resume->zb_blkid <<= zb->zb_level *
 			    (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
 		}
 		td->td_paused = B_TRUE;
 	}
 
 	return (err);
 }
 
 /*
  * Issue a metadata prefetch for every block pointer embedded in dnp
  * (bookmarked at the dnode's top indirection level, dn_nlevels - 1) and,
  * when DNODE_FLAG_SPILL_BLKPTR is set, for its spill block pointer.
  */
 static void
 prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t bm;
 
 	for (int i = 0; i < dnp->dn_nblkptr; i++) {
 		SET_BOOKMARK(&bm, objset, object, dnp->dn_nlevels - 1, i);
 		traverse_prefetch_metadata(td, &dnp->dn_blkptr[i], &bm);
 	}
 
 	/* Nothing more to do unless the dnode carries a spill block. */
 	if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
 		return;
 
 	SET_BOOKMARK(&bm, objset, object, 0, DMU_SPILL_BLKID);
 	traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &bm);
 }
 
 /*
  * Visit one dnode: optionally invoke the callback on the dnode itself
  * (TRAVERSE_PRE), walk each of its block pointers and its spill block
  * via traverse_visitbp(), then optionally invoke the callback again
  * (TRAVERSE_POST).
  *
  * Returns 0 on success or when a callback answers
  * TRAVERSE_VISIT_NO_CHILDREN; otherwise the first non-zero error.
  */
 static int
 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
 	zbookmark_phys_t bm;
 	int err = 0;
 
 	/*
 	 * When resuming, skip objects numbered below the resume bookmark.
 	 * The meta-dnode is never skipped.
 	 */
 	if (td->td_resume != NULL && object != DMU_META_DNODE_OBJECT &&
 	    object < td->td_resume->zb_object)
 		return (0);
 
 	if (td->td_flags & TRAVERSE_PRE) {
 		SET_BOOKMARK(&bm, objset, object, ZB_DNODE_LEVEL,
 		    ZB_DNODE_BLKID);
 		err = td->td_func(td->td_spa, NULL, NULL, &bm, dnp,
 		    td->td_arg);
 		if (err == TRAVERSE_VISIT_NO_CHILDREN)
 			return (0);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Stop walking the block pointers at the first failure. */
 	for (int i = 0; err == 0 && i < dnp->dn_nblkptr; i++) {
 		SET_BOOKMARK(&bm, objset, object, dnp->dn_nlevels - 1, i);
 		err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[i], &bm);
 	}
 
 	if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
 		SET_BOOKMARK(&bm, objset, object, 0, DMU_SPILL_BLKID);
 		err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &bm);
 	}
 
 	/* The post-order callback only runs after an error-free walk. */
 	if (err != 0 || !(td->td_flags & TRAVERSE_POST))
 		return (err);
 
 	SET_BOOKMARK(&bm, objset, object, ZB_DNODE_LEVEL, ZB_DNODE_BLKID);
 	err = td->td_func(td->td_spa, NULL, NULL, &bm, dnp, td->td_arg);
 
 	return (err == TRAVERSE_VISIT_NO_CHILDREN ? 0 : err);
 }
 
 /*
  * Traversal callback installed by traverse_prefetch_thread().  For each
  * block pointer that prefetch_needed() accepts, it accounts the block's
  * logical size against the prefetch window and issues a speculative,
  * no-wait ARC read with no completion callback.
  *
  * arg is the prefetch_data_t shared with the main traversal.
  *
  * Returns 0 normally (including for NULL bps and blocks that need no
  * prefetch), or EINTR once pd_cancel has been set.
  */
 /* ARGSUSED */
 static int
 traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	prefetch_data_t *pfd = arg;
 	arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
 	    ARC_FLAG_PRESCIENT_PREFETCH;
 
 	ASSERT(pfd->pd_bytes_fetched >= 0);
 	/* Callbacks made with a NULL bp have nothing to prefetch. */
 	if (bp == NULL)
 		return (0);
 	/* A non-zero return stops the prefetch traversal. */
 	if (pfd->pd_cancel)
 		return (SET_ERROR(EINTR));
 
 	if (!prefetch_needed(pfd, bp))
 		return (0);
 
 	/*
 	 * Throttle: sleep while the outstanding prefetched bytes are at or
 	 * above zfs_pd_bytes_max, unless cancellation is requested.  The
 	 * broadcast wakes any other thread waiting on pd_cv.
 	 */
 	mutex_enter(&pfd->pd_mtx);
 	while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
 		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
 	pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
 	cv_broadcast(&pfd->pd_cv);
 	mutex_exit(&pfd->pd_mtx);
 
 	/* Best effort: the result of the speculative read is ignored. */
 	(void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);
 
 	return (0);
 }
 
 /*
  * Taskq entry point for the prefetch traversal dispatched by
  * traverse_impl().  arg is the main traversal's traverse_data_t; it is
  * copied so this thread can run the same walk with traverse_prefetcher()
  * as the callback.  When the walk ends, pd_exited is set and pd_cv is
  * broadcast so the main thread can stop waiting.
  */
 static void
 traverse_prefetch_thread(void *arg)
 {
 	traverse_data_t *td_main = arg;
 	traverse_data_t td = *td_main;
 	zbookmark_phys_t czb;
 
 	/*
 	 * Redirect the private copy: the callback becomes the prefetcher,
 	 * its argument is the shared prefetch_data_t, and resume state is
 	 * tracked in pd_resume rather than the caller's bookmark.
 	 */
 	td.td_func = traverse_prefetcher;
 	td.td_arg = td_main->td_pfd;
 	td.td_pfd = NULL;
 	td.td_resume = &td_main->td_pfd->pd_resume;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	/* Errors (including EINTR on cancel) are deliberately ignored. */
 	(void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
 
 	/* Tell traverse_impl() that this thread no longer uses td_main. */
 	mutex_enter(&td_main->td_pfd->pd_mtx);
 	td_main->td_pfd->pd_exited = B_TRUE;
 	cv_broadcast(&td_main->td_pfd->pd_cv);
 	mutex_exit(&td_main->td_pfd->pd_mtx);
 }
 
 /*
  * Common implementation behind the traverse_*() entry points.
  *
  * Builds the traverse_data_t/prefetch_data_t pair, walks the ZIL of a
  * non-snapshot dataset whose root bp is not a hole, optionally dispatches
  * traverse_prefetch_thread() (TRAVERSE_PREFETCH_DATA), and then visits
  * the tree rooted at rootbp, calling func(..., arg) per the PRE/POST
  * flags.  On return *resume (if non-NULL) may have been updated by
  * traverse_visitbp().
  *
  * Returns 0 on success or the first error reported by arc_read() or the
  * traversal.
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 static int
 traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	traverse_data_t td;
 	prefetch_data_t pd = { 0 };
 	zbookmark_phys_t czb;
 	int err;
 
 	ASSERT(ds == NULL || objset == ds->ds_object);
 	ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
 
 	td.td_spa = spa;
 	td.td_objset = objset;
 	td.td_rootbp = rootbp;
 	td.td_min_txg = txg_start;
 	td.td_resume = resume;
 	td.td_func = func;
 	td.td_arg = arg;
 	td.td_pfd = &pd;
 	td.td_flags = flags;
 	td.td_paused = B_FALSE;
 	td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
 		VERIFY(spa_feature_enabled_txg(spa,
 		    SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
 	} else {
 		td.td_hole_birth_enabled_txg = UINT64_MAX;
 	}
 
 	pd.pd_flags = flags;
 	if (resume != NULL)
 		pd.pd_resume = *resume;
 	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
 
 	/* See comment on ZIL traversal in dsl_scan_visitds. */
 	if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
 		/* Named aflags so it does not shadow the flags parameter. */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		objset_phys_t *osp;
 		arc_buf_t *buf;
 
 		err = arc_read(NULL, td.td_spa, rootbp,
 		    arc_getbuf_func, &buf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, NULL);
 		if (err != 0) {
 			/*
 			 * The prefetch thread has not been dispatched yet,
 			 * so nobody else references pd; tear down the
 			 * synchronization objects instead of leaking them.
 			 */
 			mutex_destroy(&pd.pd_mtx);
 			cv_destroy(&pd.pd_cv);
 			return (err);
 		}
 
 		osp = buf->b_data;
 		traverse_zil(&td, &osp->os_zil_header);
 		arc_buf_destroy(buf, &buf);
 	}
 
 	/*
 	 * If no prefetch thread is wanted, or dispatching one failed, mark
 	 * it as already exited so the wait below does not block.
 	 */
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||
-	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
-	    &td, TQ_NOQUEUE))
+	    taskq_dispatch(system_taskq, traverse_prefetch_thread,
+	    &td, TQ_NOQUEUE) == TASKQID_INVALID)
 		pd.pd_exited = B_TRUE;
 
 	SET_BOOKMARK(&czb, td.td_objset,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	err = traverse_visitbp(&td, NULL, rootbp, &czb);
 
 	/*
 	 * td and pd live on this stack frame and were handed to the
 	 * prefetch thread by pointer, so cancel it and wait for it to
 	 * report pd_exited before they go out of scope.
 	 */
 	mutex_enter(&pd.pd_mtx);
 	pd.pd_cancel = B_TRUE;
 	cv_broadcast(&pd.pd_cv);
 	while (!pd.pd_exited)
 		cv_wait(&pd.pd_cv, &pd.pd_mtx);
 	mutex_exit(&pd.pd_mtx);
 
 	mutex_destroy(&pd.pd_mtx);
 	cv_destroy(&pd.pd_cv);
 
 	return (err);
 }
 
 /*
  * Traverse dataset ds starting from txg_start, optionally resuming from
  * the bookmark in *resume (which may be NULL).  The pool, object set
  * number and root block pointer are all taken from ds.
  *
  * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
  * in syncing context).
  */
 int
 traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
     zbookmark_phys_t *resume,
     int flags, blkptr_cb_t func, void *arg)
 {
 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	blkptr_t *rootbp = &dsl_dataset_phys(ds)->ds_bp;
 
 	return (traverse_impl(spa, ds, ds->ds_object, rootbp,
 	    txg_start, resume, flags, func, arg));
 }
 
 /*
  * Traverse dataset ds from txg_start with no resume bookmark; a thin
  * wrapper around traverse_dataset_resume().
  */
 int
 traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
     int flags, blkptr_cb_t func, void *arg)
 {
 	int err;
 
 	err = traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg);
 
 	return (err);
 }
 
 /*
  * Traverse the tree rooted at blkptr without an owning dsl_dataset_t;
  * the object set is identified as ZB_DESTROYED_OBJSET.
  */
 int
 traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
     uint64_t txg_start, zbookmark_phys_t *resume, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 
 	err = traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
 	    blkptr, txg_start, resume, flags, func, arg);
 
 	return (err);
 }
 
 /*
  * Traverse the whole pool: first the MOS (object set 0, rooted at the
  * pool's root block pointer), then every MOS object whose bonus type is
  * DMU_OT_DSL_DATASET.  With TRAVERSE_HARD, failures to look up or hold an
  * individual object are skipped rather than aborting the walk.
  *
  * Returns 0 on success, otherwise the first error that stopped the walk.
  *
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
 	int err;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	objset_t *mos = dp->dp_meta_objset;
 	boolean_t hard = (flags & TRAVERSE_HARD);
 
 	/* visit the MOS */
 	err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
 	    txg_start, NULL, flags, func, arg);
 	if (err != 0)
 		return (err);
 
 	/*
 	 * visit each dataset
 	 *
 	 * Note that "continue" still runs the loop's step expression, so
 	 * err is overwritten by dmu_object_next() before the loop condition
 	 * is re-tested; "break" leaves err as set in the body.
 	 */
 	for (uint64_t obj = 1; err == 0;
 	    err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
 		dmu_object_info_t doi;
 
 		err = dmu_object_info(mos, obj, &doi);
 		if (err != 0) {
 			if (hard)
 				continue;
 			break;
 		}
 
 		if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
 			dsl_dataset_t *ds;
 			uint64_t txg = txg_start;
 
 			/* The config lock is held only across the hold. */
 			dsl_pool_config_enter(dp, FTAG);
 			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
 			dsl_pool_config_exit(dp, FTAG);
 			if (err != 0) {
 				if (hard)
 					continue;
 				break;
 			}
 			/*
 			 * Start no earlier than the previous snapshot's txg.
 			 */
 			if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
 				txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
 			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
 			if (err != 0)
 				break;
 		}
 	}
 	/*
 	 * NOTE(review): ESRCH is presumably how dmu_object_next() reports
 	 * that there are no more objects; it is treated as success here.
 	 */
 	if (err == ESRCH)
 		err = 0;
 	return (err);
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	(revision 354953)
@@ -1,4492 +1,4492 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/space_map.h>
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/zap.h>
 
 #define	GANG_ALLOCATION(flags) \
 	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
 
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
  * Since we can touch multiple metaslabs (and their respective space maps)
  * with each transaction group, we benefit from having a smaller space map
  * block size since it allows us to issue more I/O operations scattered
  * around the disk.
  */
 int zfs_metaslab_sm_blksz = (1 << 12);
 
 /*
  * The in-core space map representation is more compact than its on-disk form.
  * The zfs_condense_pct determines how much more compact the in-core
  * space map representation must be before we compact it on-disk.
  * Values should be greater than or equal to 100.
  */
 int zfs_condense_pct = 200;
 
 /*
  * Condensing a metaslab is not guaranteed to actually reduce the amount of
  * space used on disk. In particular, a space map uses data in increments of
  * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
  * same number of blocks after condensing. Since the goal of condensing is to
  * reduce the number of IOPs required to read the space map, we only want to
  * condense when we can be sure we will reduce the number of blocks used by the
  * space map. Unfortunately, we cannot precisely compute whether or not this is
  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
  * we apply the following heuristic: do not condense a spacemap unless the
  * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
  * blocks.
  */
 int zfs_metaslab_condense_block_threshold = 4;
 
 /*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * free space. Metaslab groups that have more free space than
  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
  * a metaslab group's free space is less than or equal to the
  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
  * groups are allowed to accept allocations. Gang blocks are always
  * eligible to allocate on any metaslab group. The default value of 0 means
  * no metaslab group will be excluded based on this criterion.
  */
 int zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab groups are considered eligible for allocations if their
  * fragmentation metric (measured as a percentage) is less than or equal to
  * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
  * then it will be skipped unless all metaslab groups within the metaslab
  * class have also crossed this threshold.
  */
 int zfs_mg_fragmentation_threshold = 85;
 
 /*
  * Allow metaslabs to keep their active state as long as their fragmentation
  * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
  * active metaslab that exceeds this threshold will no longer keep its active
  * status allowing better metaslabs to be selected.
  */
 int zfs_metaslab_fragmentation_threshold = 70;
 
 /*
  * When set will load all metaslabs when pool is first opened.
  */
 int metaslab_debug_load = 0;
 
 /*
  * When set will prevent metaslabs from being unloaded.
  */
 int metaslab_debug_unload = 0;
 
 /*
  * Minimum size which forces the dynamic allocator to change
  * its allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using a more
  * aggressive strategy (i.e. search by size rather than offset).
  */
 uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * The minimum free space, in percent, which must be available
  * in a space map to continue allocations in a first-fit fashion.
  * Once the space map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
 int metaslab_df_free_pct = 4;
 
 /*
  * A metaslab is considered "free" if it contains a contiguous
  * segment which is greater than metaslab_min_alloc_size.
  */
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 
 /*
  * Percentage of all cpus that can be used by the metaslab taskq.
  */
 int metaslab_load_pct = 50;
 
 /*
  * Determines how many txgs a metaslab may remain loaded without having any
  * allocations from it. As long as a metaslab continues to be used we will
  * keep it loaded.
  */
 int metaslab_unload_delay = TXG_SIZE * 2;
 
 /*
  * Max number of metaslabs per group to preload.
  */
 int metaslab_preload_limit = SPA_DVAS_PER_BP;
 
 /*
  * Enable/disable preloading of metaslab.
  */
 boolean_t metaslab_preload_enabled = B_TRUE;
 
 /*
  * Enable/disable fragmentation weighting on metaslabs.
  */
 boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
 
 /*
  * Enable/disable lba weighting (i.e. outer tracks are given preference).
  */
 boolean_t metaslab_lba_weighting_enabled = B_TRUE;
 
 /*
  * Enable/disable metaslab group biasing.
  */
 boolean_t metaslab_bias_enabled = B_TRUE;
 
 /*
  * Enable/disable remapping of indirect DVAs to their concrete vdevs.
  */
 boolean_t zfs_remap_blkptr_enable = B_TRUE;
 
 /*
  * Enable/disable segment-based metaslab selection.
  */
 boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;
 
 /*
  * When using segment-based metaslab selection, we will continue
  * allocating from the active metaslab until we have exhausted
  * zfs_metaslab_switch_threshold of its buckets.
  */
 int zfs_metaslab_switch_threshold = 2;
 
 /*
  * Internal switch to enable/disable the metaslab allocation tracing
  * facility.
  */
 boolean_t metaslab_trace_enabled = B_TRUE;
 
 /*
  * Maximum entries that the metaslab allocation tracing facility will keep
  * in a given list when running in non-debug mode. We limit the number
  * of entries in non-debug mode to prevent us from using up too much memory.
  * The limit should be sufficiently large that we don't expect any allocation
  * to ever exceed this value. In debug mode, the system will panic if this
  * limit is ever reached allowing for further investigation.
  */
 uint64_t metaslab_trace_max_entries = 5000;
 
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 
 kmem_cache_t *metaslab_alloc_trace_cache;
 
 /*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 /*
  * Allocate and initialize a metaslab class for spa using the allocator
  * operations in ops.  The class starts with no rotor and with one
  * tracked refcount and one max-slot counter per spa_alloc_count entry.
  * All allocations use KM_SLEEP.
  */
 metaslab_class_t *
 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
 	metaslab_class_t *mc;
 	size_t slots_size, max_slots_size;
 
 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
 	mc->mc_spa = spa;
 	mc->mc_rotor = NULL;
 	mc->mc_ops = ops;
 	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Per-allocator slot accounting. */
 	slots_size = spa->spa_alloc_count * sizeof (zfs_refcount_t);
 	mc->mc_alloc_slots = kmem_zalloc(slots_size, KM_SLEEP);
 
 	max_slots_size = spa->spa_alloc_count * sizeof (uint64_t);
 	mc->mc_alloc_max_slots = kmem_zalloc(max_slots_size, KM_SLEEP);
 
 	for (int i = 0; i < spa->spa_alloc_count; i++) {
 		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
 	}
 
 	return (mc);
 }
 
 /*
  * Tear down a metaslab class created by metaslab_class_create().  The
  * class must already be empty: no rotor and zero alloc, deferred, space
  * and dspace counters (asserted).
  */
 void
 metaslab_class_destroy(metaslab_class_t *mc)
 {
 	ASSERT(mc->mc_rotor == NULL);
 	ASSERT(mc->mc_alloc == 0);
 	ASSERT(mc->mc_deferred == 0);
 	ASSERT(mc->mc_space == 0);
 	ASSERT(mc->mc_dspace == 0);
 
 	/* Destroy every per-allocator refcount before freeing the array. */
 	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) {
 		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
 	}
 
 	kmem_free(mc->mc_alloc_slots,
 	    mc->mc_spa->spa_alloc_count * sizeof (zfs_refcount_t));
 	kmem_free(mc->mc_alloc_max_slots,
 	    mc->mc_spa->spa_alloc_count * sizeof (uint64_t));
 
 	mutex_destroy(&mc->mc_lock);
 	kmem_free(mc, sizeof (metaslab_class_t));
 }
 
 /*
  * Debug check of the class's rotor: walk the circular list of metaslab
  * groups once and assert that each group belongs to this class and sits
  * on a top-level, non-hole vdev that has a metaslab group.
  *
  * Always returns 0.
  */
 int
 metaslab_class_validate(metaslab_class_t *mc)
 {
 	metaslab_group_t *mg = mc->mc_rotor;
 	vdev_t *vd;
 
 	/*
 	 * Must hold one of the spa_config locks.
 	 */
 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
 
 	/* An empty rotor is trivially valid. */
 	if (mg == NULL)
 		return (0);
 
 	do {
 		vd = mg->mg_vd;
 		ASSERT(vd->vdev_mg != NULL);
 		ASSERT3P(vd->vdev_top, ==, vd);
 		ASSERT3P(mg->mg_class, ==, mc);
 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
 
 		mg = mg->mg_next;
 	} while (mg != mc->mc_rotor);
 
 	return (0);
 }
 
 /*
  * Apply signed deltas to the class's four space counters (mc_alloc,
  * mc_deferred, mc_space, mc_dspace).  Each counter is updated with its
  * own atomic add; the four updates are not atomic as a group.
  */
 static void
 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
 {
 	atomic_add_64(&mc->mc_alloc, alloc_delta);
 	atomic_add_64(&mc->mc_deferred, defer_delta);
 	atomic_add_64(&mc->mc_space, space_delta);
 	atomic_add_64(&mc->mc_dspace, dspace_delta);
 }
 
 /* Return the class's current mc_alloc counter. */
 uint64_t
 metaslab_class_get_alloc(metaslab_class_t *mc)
 {
 	uint64_t allocated = mc->mc_alloc;
 
 	return (allocated);
 }
 
 /* Return the class's current mc_deferred counter. */
 uint64_t
 metaslab_class_get_deferred(metaslab_class_t *mc)
 {
 	uint64_t deferred = mc->mc_deferred;
 
 	return (deferred);
 }
 
 /* Return the class's current mc_space counter. */
 uint64_t
 metaslab_class_get_space(metaslab_class_t *mc)
 {
 	uint64_t space = mc->mc_space;
 
 	return (space);
 }
 
 /*
  * Return mc_dspace when spa_deflate() is true for the class's pool,
  * otherwise mc_space.
  */
 uint64_t
 metaslab_class_get_dspace(metaslab_class_t *mc)
 {
 	if (spa_deflate(mc->mc_spa))
 		return (mc->mc_dspace);
 
 	return (mc->mc_space);
 }
 
 /*
  * When ZFS_DEBUG_HISTOGRAM_VERIFY is set in zfs_flags, recompute the
  * class histogram by summing mg_histogram over every metaslab group in
  * this class and VERIFY that it matches mc_histogram bucket by bucket.
  * Does nothing when the debug flag is clear.
  */
 void
 metaslab_class_histogram_verify(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t *mc_hist;
 	int i;
 
 	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
 		return;
 
 	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
 	    KM_SLEEP);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Only concrete, initialized top-level vdevs that belong
 		 * to this metaslab class contribute to the sum.
 		 */
 		if (vdev_is_concrete(tvd) && tvd->vdev_ms_shift != 0 &&
 		    mg->mg_class == mc) {
 			for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 				mc_hist[i] += mg->mg_histogram[i];
 		}
 	}
 
 	i = 0;
 	while (i < RANGE_TREE_HISTOGRAM_SIZE) {
 		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
 		i++;
 	}
 
 	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
 }
 
 /*
  * Calculate the metaslab class's fragmentation metric. The metric
  * is weighted based on the space contribution of each metaslab group.
  * The return value will be a number between 0 and 100 (inclusive), or
  * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
  * zfs_frag_table for more information about the metric.
  *
  * SCL_VDEV is taken as reader for the duration of the walk and is
  * released on both return paths.
  */
 uint64_t
 metaslab_class_fragmentation(metaslab_class_t *mc)
 {
 	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
 	uint64_t fragmentation = 0;
 
 	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 
 		/*
 		 * Skip any holes, uninitialized top-levels,
 		 * or vdevs that are not in this metaslab class.
 		 */
 		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
 		    mg->mg_class != mc) {
 			continue;
 		}
 
 		/*
 		 * If a metaslab group does not contain a fragmentation
 		 * metric then just bail out.
 		 */
 		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
 			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 			return (ZFS_FRAG_INVALID);
 		}
 
 		/*
 		 * Determine how much this metaslab_group is contributing
 		 * to the overall pool fragmentation metric.
 		 */
 		fragmentation += mg->mg_fragmentation *
 		    metaslab_group_get_space(mg);
 	}
 	/*
 	 * Turn the space-weighted sum into a weighted average.
 	 * NOTE(review): this divides by the class's total space with no
 	 * zero check; presumably callers only ask about classes that have
 	 * space -- confirm.
 	 */
 	fragmentation /= metaslab_class_get_space(mc);
 
 	ASSERT3U(fragmentation, <=, 100);
 	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
 	return (fragmentation);
 }
 
 /*
  * Sum, over every top-level vdev in this metaslab class, the space by
  * which the vdev could still grow (vdev_max_asize - vdev_asize).  The
  * per-vdev amount is reduced by spa_bootsize when it exceeds it, and is
  * rounded down to a whole number of metaslabs, since the metaslab is the
  * unit of expansion.  SCL_VDEV is held as reader during the walk.
  */
 uint64_t
 metaslab_class_expandable_space(metaslab_class_t *mc)
 {
 	spa_t *spa = mc->mc_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t total = 0;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		metaslab_group_t *mg = tvd->vdev_mg;
 		uint64_t growth;
 
 		/*
 		 * Only concrete, initialized top-level vdevs that belong
 		 * to this class are counted.
 		 */
 		if (vdev_is_concrete(tvd) && tvd->vdev_ms_shift != 0 &&
 		    mg->mg_class == mc) {
 			growth = tvd->vdev_max_asize - tvd->vdev_asize;
 
 			/* Adjust by efi system partition size. */
 			if (growth > spa->spa_bootsize)
 				growth -= spa->spa_bootsize;
 
 			total += P2ALIGN(growth, 1ULL << tvd->vdev_ms_shift);
 		}
 	}
 
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (total);
 }
 
 /*
  * AVL comparator for a metaslab group's tree of metaslabs.
  *
  * Ordering keys, in priority order:
  *   1. activation class: inactive (ms_allocator == -1) first, then
  *      primaries, then secondaries;
  *   2. weight, heavier metaslabs first;
  *   3. starting offset, ascending, as the final tie-breaker.
  *
  * Returns -1, 0 or 1 in the usual comparator convention.
  */
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
 	const metaslab_t *m1 = (const metaslab_t *)x1;
 	const metaslab_t *m2 = (const metaslab_t *)x2;
 	int sort1 = 0;
 	int sort2 = 0;
 	int cmp;
 
 	/* 0 = inactive, 1 = active primary, 2 = active secondary. */
 	if (m1->ms_allocator != -1)
 		sort1 = m1->ms_primary ? 1 : 2;
 	if (m2->ms_allocator != -1)
 		sort2 = m2->ms_primary ? 1 : 2;
 
 	/*
 	 * Sort inactive metaslabs first, then primaries, then secondaries. When
 	 * selecting a metaslab to allocate from, an allocator first tries its
 	 * primary, then secondary active metaslab. If it doesn't have active
 	 * metaslabs, or can't allocate from them, it searches for an inactive
 	 * metaslab to activate. If it can't find a suitable one, it will steal
 	 * a primary or secondary metaslab from another allocator.
 	 */
 	if (sort1 != sort2)
 		return (sort1 < sort2 ? -1 : 1);
 
 	/* Operands are swapped so that larger weights sort earlier. */
 	cmp = AVL_CMP(m2->ms_weight, m1->ms_weight);
 	if (likely(cmp))
 		return (cmp);
 
 	/* Two distinct metaslabs must never share a starting offset. */
 	IMPLY(AVL_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
 
 	return (AVL_CMP(m1->ms_start, m2->ms_start));
 }
 
 /* Return the metaslab's ms_allocated_space counter. */
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
 	uint64_t allocated = msp->ms_allocated_space;
 
 	return (allocated);
 }
 
 /*
  * Verify that the space accounting on disk matches the in-core range_trees.
  *
  * Only active when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags, and only
  * when txg is the currently syncing txg and the metaslab is loaded and has
  * a space map.  The caller must hold ms_lock and the metaslab must not be
  * condensing (both asserted).
  */
 static void
 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t allocating = 0;
 	uint64_t sm_free_space, msp_free_space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!msp->ms_condensing);
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/*
 	 * We can only verify the metaslab space when we're called
 	 * from syncing context with a loaded metaslab that has an
 	 * allocated space map. Calling this in non-syncing context
 	 * does not provide a consistent view of the metaslab since
 	 * we're performing allocations in the future.
 	 */
 	if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
 	    !msp->ms_loaded)
 		return;
 
 	/*
 	 * Even though the smp_alloc field can get negative (e.g.
 	 * see vdev_checkpoint_sm), that should never be the case
 	 * when it comes to a metaslab's space map.
 	 */
 	ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
 
 	/* Free space according to the on-disk accounting. */
 	sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
 
 	/*
 	 * Account for future allocations since we would have
 	 * already deducted that space from the ms_allocatable.
 	 */
 	for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
 		allocating +=
 		    range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
 	}
 
 	ASSERT3U(msp->ms_deferspace, ==,
 	    range_tree_space(msp->ms_defer[0]) +
 	    range_tree_space(msp->ms_defer[1]));
 
 	/*
 	 * Free space according to the in-core trees: allocatable space plus
 	 * everything pending (allocating), deferred, or freed.
 	 */
 	msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
 	    msp->ms_deferspace + range_tree_space(msp->ms_freed);
 
 	VERIFY3U(sm_free_space, ==, msp_free_space);
 }
 
 /*
  * ==========================================================================
  * Metaslab groups
  * ==========================================================================
  */
 /*
  * Update the allocatable flag and the metaslab group's capacity.
  * The allocatable flag is set to true if the group is activated, its
  * free capacity is above the zfs_mg_noalloc_threshold, and its
  * fragmentation value is either not yet known (ZFS_FRAG_INVALID) or no
  * greater than zfs_mg_fragmentation_threshold. If a metaslab group
  * transitions from allocatable to non-allocatable or vice versa then the
  * metaslab group's class is updated to reflect the transition.
  *
  * The caller must hold SCL_ALLOC as reader (asserted).  mg_lock is taken
  * first and mc_lock is nested inside it.
  */
 static void
 metaslab_group_alloc_update(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	metaslab_class_t *mc = mg->mg_class;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	boolean_t was_allocatable;
 	boolean_t was_initialized;
 
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
 	    SCL_ALLOC);
 
 	mutex_enter(&mg->mg_lock);
 	/* Remember the previous state so transitions can be detected. */
 	was_allocatable = mg->mg_allocatable;
 	was_initialized = mg->mg_initialized;
 
 	/*
 	 * Free capacity as a percentage; the "+ 1" in the divisor avoids a
 	 * division by zero when vs_space is 0.
 	 */
 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
 	    (vs->vs_space + 1);
 
 	mutex_enter(&mc->mc_lock);
 
 	/*
 	 * If the metaslab group was just added then it won't
 	 * have any space until we finish syncing out this txg.
 	 * At that point we will consider it initialized and available
 	 * for allocations.  We also don't consider non-activated
 	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
 	 * to be initialized, because they can't be used for allocation.
 	 */
 	mg->mg_initialized = metaslab_group_initialized(mg);
 	if (!was_initialized && mg->mg_initialized) {
 		mc->mc_groups++;
 	} else if (was_initialized && !mg->mg_initialized) {
 		ASSERT3U(mc->mc_groups, >, 0);
 		mc->mc_groups--;
 	}
 	if (mg->mg_initialized)
 		mg->mg_no_free_space = B_FALSE;
 
 	/*
 	 * A metaslab group is considered allocatable if it has plenty
 	 * of free space or is not heavily fragmented. We only take
 	 * fragmentation into account if the metaslab group has a valid
 	 * fragmentation metric (i.e. a value between 0 and 100).
 	 */
 	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
 	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
 	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
 	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
 
 	/*
 	 * The mc_alloc_groups maintains a count of the number of
 	 * groups in this metaslab class that are still above the
 	 * zfs_mg_noalloc_threshold. This is used by the allocating
 	 * threads to determine if they should avoid allocations to
 	 * a given group. The allocator will avoid allocations to a group
 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
 	 * and there are still other groups that are above the threshold.
 	 * When a group transitions from allocatable to non-allocatable or
 	 * vice versa we update the metaslab class to reflect that change.
 	 * When the mc_alloc_groups value drops to 0 that means that all
 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
 	 * eligible for allocations. This effectively means that all devices
 	 * are balanced again.
 	 */
 	if (was_allocatable && !mg->mg_allocatable)
 		mc->mc_alloc_groups--;
 	else if (!was_allocatable && mg->mg_allocatable)
 		mc->mc_alloc_groups++;
 	mutex_exit(&mc->mc_lock);
 
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Allocate and initialize a metaslab group for top-level vdev vd in
  * class mc, with per-allocator state sized for "allocators" allocators.
  * The new group starts with an activation count of 0, not initialized,
  * and flagged as having no free space.  All allocations use KM_SLEEP.
  *
  * Returns the new group; release it with metaslab_group_destroy().
  */
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 {
 	metaslab_group_t *mg;
 
 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
 	/* One primary and one secondary metaslab slot per allocator. */
 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 	    KM_SLEEP);
 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 	    KM_SLEEP);
 	/* The group's metaslabs are kept sorted by metaslab_compare(). */
 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
 	mg->mg_vd = vd;
 	mg->mg_class = mc;
 	mg->mg_activation_count = 0;
 	mg->mg_initialized = B_FALSE;
 	mg->mg_no_free_space = B_TRUE;
 	mg->mg_allocators = allocators;
 
 	/* Per-allocator queue depth accounting. */
 	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
 	    sizeof (zfs_refcount_t), KM_SLEEP);
 	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
 	    sizeof (uint64_t), KM_SLEEP);
 	for (int i = 0; i < allocators; i++) {
 		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
 	}
 
 	/*
 	 * With TASKQ_THREADS_CPU_PCT the thread-count argument
 	 * (metaslab_load_pct) is a percentage of CPUs, per the comment on
 	 * that tunable.
 	 */
 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 
 	return (mg);
 }
 
 /*
  * Free a metaslab group created by metaslab_group_create().  The group
  * must already be off the class rotor (mg_prev/mg_next NULL) and must not
  * be active (both asserted).
  */
 void
 metaslab_group_destroy(metaslab_group_t *mg)
 {
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	/*
 	 * We may have gone below zero with the activation count
 	 * either because we never activated in the first place or
 	 * because we're done, and possibly removing the vdev.
 	 */
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* The taskq is destroyed before any state it might use is freed. */
 	taskq_destroy(mg->mg_taskq);
 	avl_destroy(&mg->mg_metaslab_tree);
 	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
 	    sizeof (metaslab_t *));
 	mutex_destroy(&mg->mg_lock);
 	mutex_destroy(&mg->mg_ms_initialize_lock);
 	cv_destroy(&mg->mg_ms_initialize_cv);
 
 	/* Destroy each per-allocator refcount before freeing the arrays. */
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
 		mg->mg_cur_max_alloc_queue_depth[i] = 0;
 	}
 	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
 	    sizeof (zfs_refcount_t));
 	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
 	    sizeof (uint64_t));
 
 	kmem_free(mg, sizeof (metaslab_group_t));
 }
 
 /*
  * Bump the group's activation count and, once it becomes positive, link
  * the group into its class's circular rotor list and make it the rotor.
  * The caller must hold SCL_ALLOC as writer.
  */
 void
 metaslab_group_activate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 
 	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);
 
 	ASSERT(mc->mc_rotor != mg);
 	ASSERT(mg->mg_prev == NULL);
 	ASSERT(mg->mg_next == NULL);
 	ASSERT(mg->mg_activation_count <= 0);
 
 	/* Still not positive after the increment: nothing to link yet. */
 	mg->mg_activation_count++;
 	if (mg->mg_activation_count <= 0)
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
 	metaslab_group_alloc_update(mg);
 
 	metaslab_group_t *rotor = mc->mc_rotor;
 	if (rotor != NULL) {
 		/* Splice mg in right after the current rotor. */
 		metaslab_group_t *after = rotor->mg_next;
 
 		mg->mg_prev = rotor;
 		mg->mg_next = after;
 		rotor->mg_next = mg;
 		after->mg_prev = mg;
 	} else {
 		/* First group in the class: a ring of one. */
 		mg->mg_prev = mg;
 		mg->mg_next = mg;
 	}
 	mc->mc_rotor = mg;
 }
 
 /*
  * Take a metaslab group out of service and unlink it from its class's
  * allocation rotor.
  *
  * The caller must hold both SCL_ALLOC and SCL_ZIO as writer. While the
  * group's taskq drains, this function temporarily releases SCL_ZIO and any
  * lower-ordered config locks the caller holds, and re-acquires them before
  * continuing.
  */
 void
 metaslab_group_passivate(metaslab_group_t *mg)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	spa_t *spa = mc->mc_spa;
 	int held = spa_config_held(spa, SCL_ALL, RW_WRITER);
 
 	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
 	    (SCL_ALLOC | SCL_ZIO));
 
 	/* Only the transition to exactly zero actually passivates. */
 	mg->mg_activation_count--;
 	if (mg->mg_activation_count != 0) {
 		ASSERT(mc->mc_rotor != mg);
 		ASSERT(mg->mg_prev == NULL);
 		ASSERT(mg->mg_next == NULL);
 		ASSERT(mg->mg_activation_count < 0);
 		return;
 	}
 
 	/*
 	 * The spa_config_lock is an array of rwlocks ordered, from highest
 	 * to lowest, as:
 	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
 	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
 	 * (see spa_misc.c for details). Higher locks have broader coverage.
 	 * The group's taskq may be busy preloading metaslabs, so SCL_ZIO and
 	 * everything below it must be dropped to let that I/O finish.
 	 * SCL_ALLOC stays held throughout, which blocks new allocations and
 	 * changes to the vdev tree.
 	 */
 	int droppable = held & ~(SCL_ZIO - 1);
 
 	spa_config_exit(spa, droppable, spa);
 	taskq_wait(mg->mg_taskq);
 	spa_config_enter(spa, droppable, spa, RW_WRITER);
 	metaslab_group_alloc_update(mg);
 
 	/*
 	 * Passivate each allocator's primary metaslab and then its secondary.
 	 * Each slot is read only when it is about to be processed.
 	 */
 	for (int a = 0; a < mg->mg_allocators; a++) {
 		for (int which = 0; which < 2; which++) {
 			metaslab_t *msp = (which == 0) ?
 			    mg->mg_primaries[a] : mg->mg_secondaries[a];
 
 			if (msp == NULL)
 				continue;
 
 			mutex_enter(&msp->ms_lock);
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 			mutex_exit(&msp->ms_lock);
 		}
 	}
 
 	/* Unlink from the circular rotor list. */
 	metaslab_group_t *before = mg->mg_prev;
 	metaslab_group_t *after = mg->mg_next;
 
 	if (after != mg) {
 		mc->mc_rotor = after;
 		before->mg_next = after;
 		after->mg_prev = before;
 	} else {
 		/* mg was the only group on the rotor. */
 		mc->mc_rotor = NULL;
 	}
 
 	mg->mg_prev = NULL;
 	mg->mg_next = NULL;
 }
 
 /*
  * A group counts as initialized when its vdev reports non-zero space and
  * the group's activation count is positive.
  */
 boolean_t
 metaslab_group_initialized(metaslab_group_t *mg)
 {
 	if (mg->mg_vd->vdev_stat.vs_space == 0)
 		return (B_FALSE);
 
 	return (mg->mg_activation_count > 0);
 }
 
 /*
  * Return the size of one metaslab (2^vdev_ms_shift) multiplied by the
  * number of metaslabs on the group's vdev.
  */
 uint64_t
 metaslab_group_get_space(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 
 	return ((1ULL << vd->vdev_ms_shift) * vd->vdev_ms_count);
 }
 
 /*
  * Debug check, active only when ZFS_DEBUG_HISTOGRAM_VERIFY is set in
  * zfs_flags: rebuild the group histogram from the space map histograms of
  * the group's metaslabs and VERIFY it matches mg_histogram.
  */
 void
 metaslab_group_histogram_verify(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t ashift = vd->vdev_ashift;
 	size_t hist_bytes = sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE;
 	uint64_t *expected;
 
 	if (!(zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY))
 		return;
 
 	expected = kmem_zalloc(hist_bytes, KM_SLEEP);
 
 	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
 	    SPACE_MAP_HISTOGRAM_SIZE + ashift);
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 		ASSERT(msp != NULL);
 
 		/* only metaslabs with a space map that belong to mg count */
 		if (msp->ms_sm != NULL && msp->ms_group == mg) {
 			for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 				expected[b + ashift] +=
 				    msp->ms_sm->sm_phys->smp_histogram[b];
 			}
 		}
 	}
 
 	for (int b = 0; b < RANGE_TREE_HISTOGRAM_SIZE; b++)
 		VERIFY3U(expected[b], ==, mg->mg_histogram[b]);
 
 	kmem_free(expected, hist_bytes);
 }
 
 /*
  * Add the metaslab's space map histogram into both its group's and its
  * class's histograms, offset by the vdev's ashift. A metaslab without a
  * space map contributes nothing. The caller must hold ms_lock.
  */
 static void
 metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t segs = msp->ms_sm->sm_phys->smp_histogram[b];
 		uint64_t slot = b + ashift;
 
 		mg->mg_histogram[slot] += segs;
 		mc->mc_histogram[slot] += segs;
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Subtract the metaslab's space map histogram from both its group's and
  * its class's histograms, offset by the vdev's ashift. A metaslab without
  * a space map is ignored. The caller must hold ms_lock.
  */
 void
 metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	metaslab_class_t *mc = mg->mg_class;
 	uint64_t ashift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	if (msp->ms_sm == NULL)
 		return;
 
 	mutex_enter(&mg->mg_lock);
 	for (int b = 0; b < SPACE_MAP_HISTOGRAM_SIZE; b++) {
 		uint64_t segs = msp->ms_sm->sm_phys->smp_histogram[b];
 		uint64_t slot = b + ashift;
 
 		/* neither aggregate may drop below zero */
 		ASSERT3U(mg->mg_histogram[slot], >=, segs);
 		ASSERT3U(mc->mc_histogram[slot], >=, segs);
 
 		mg->mg_histogram[slot] -= segs;
 		mc->mc_histogram[slot] -= segs;
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Attach a metaslab that currently has no group to mg: set its group
  * pointer, reset its weight to 0, insert it into the group's metaslab
  * tree, and then fold its histogram into the group's.
  */
 static void
 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
 {
 	ASSERT(msp->ms_group == NULL);
 	mutex_enter(&mg->mg_lock);
 	msp->ms_group = mg;
 	/* The weight is set before the metaslab is inserted into the tree. */
 	msp->ms_weight = 0;
 	avl_add(&mg->mg_metaslab_tree, msp);
 	mutex_exit(&mg->mg_lock);
 
 	/* metaslab_group_histogram_add() asserts that ms_lock is held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_add(mg, msp);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Detach a metaslab from mg: first subtract its histogram from the
  * group's, then remove it from the group's metaslab tree and clear its
  * group pointer.
  */
 static void
 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
 {
 	/* metaslab_group_histogram_remove() asserts that ms_lock is held. */
 	mutex_enter(&msp->ms_lock);
 	metaslab_group_histogram_remove(mg, msp);
 	mutex_exit(&msp->ms_lock);
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_group = NULL;
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Give msp a new weight and reposition it in the group's metaslab tree.
  * The metaslab is removed from the tree before its weight changes and is
  * re-added afterwards. The caller must hold mg_lock.
  */
 static void
 metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_lock));
 	ASSERT(msp->ms_group == mg);
 	avl_remove(&mg->mg_metaslab_tree, msp);
 	msp->ms_weight = weight;
 	avl_add(&mg->mg_metaslab_tree, msp);
 
 }
 
 /*
  * Locked wrapper around metaslab_group_sort_impl(): takes mg_lock, applies
  * the new weight and re-sorts the metaslab, then drops mg_lock. The caller
  * must hold the metaslab's ms_lock.
  */
 static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
 	/*
 	 * Although in principle the weight can be any value, in
 	 * practice we do not use values in the range [1, 511].
 	 */
 	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	mutex_enter(&mg->mg_lock);
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Compute the fragmentation of a metaslab group as the plain average of
  * its metaslabs' fragmentation values; a simple average suffices because
  * every metaslab in the group has the same size. Returns a value in
  * [0, 100], or ZFS_FRAG_INVALID when no more than half of the vdev's
  * metaslabs contribute a valid fragmentation metric.
  */
 uint64_t
 metaslab_group_fragmentation(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
 	uint64_t frag_sum = 0;
 	uint64_t counted = 0;
 
 	for (int m = 0; m < vd->vdev_ms_count; m++) {
 		metaslab_t *msp = vd->vdev_ms[m];
 
 		/* count only members of this group with a valid metric */
 		if (msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 		    msp->ms_group == mg) {
 			counted++;
 			frag_sum += msp->ms_fragmentation;
 		}
 	}
 
 	if (counted <= vd->vdev_ms_count / 2)
 		return (ZFS_FRAG_INVALID);
 
 	uint64_t average = frag_sum / counted;
 	ASSERT3U(average, <=, 100);
 	return (average);
 }
 
 /*
  * Decide whether allocations may be attempted on this metaslab group.
  *
  * A group should be skipped when its free capacity is below
  * zfs_mg_noalloc_threshold or its fragmentation exceeds
  * zfs_mg_fragmentation_threshold, provided some other group can still
  * take allocations. With the allocation throttle enabled, a group that has
  * reached its maximum allocation queue depth is also skipped unless it is
  * the only eligible group left.
  *
  * Returns B_TRUE if the group should be tried, B_FALSE if it should be
  * skipped.
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
     uint64_t psize, int allocator)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_class_t *mc = mg->mg_class;
 
 	/*
 	 * Skipping is only an option for the normal, special and dedup
 	 * classes, and only when the class has more than one group to
 	 * choose from. In every other case the group is always eligible.
 	 */
 	boolean_t skippable_class = (mc == spa_normal_class(spa) ||
 	    mc == spa_special_class(spa) || mc == spa_dedup_class(spa));
 	if (!skippable_class || mc->mc_groups <= 1)
 		return (B_TRUE);
 
 	/*
 	 * A group whose mg_allocatable flag is clear (see
 	 * metaslab_group_alloc_update()) is used only when no group in the
 	 * class is allocatable any more (mc_alloc_groups == 0) or when the
 	 * request is for the smallest gang block size.
 	 */
 	if (!mg->mg_allocatable) {
 		if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE)
 			return (B_TRUE);
 		return (B_FALSE);
 	}
 
 	uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
 
 	/* Without the throttle, an allocatable group is always eligible. */
 	if (!mc->mc_alloc_throttle_enabled)
 		return (B_TRUE);
 
 	/* A group with no free space is not worth examining further. */
 	if (mg->mg_no_free_space)
 		return (B_FALSE);
 
 	int64_t qdepth = zfs_refcount_count(
 	    &mg->mg_alloc_queue_depth[allocator]);
 
 	/*
 	 * Try this group if it is still below its queue depth limit, or if
 	 * it is the only allocatable metaslab group.
 	 */
 	if (qdepth < qmax || mc->mc_alloc_groups == 1)
 		return (B_TRUE);
 	ASSERT3U(mc->mc_alloc_groups, >, 1);
 
 	/*
 	 * This group is at or over its limit. Look at the groups that follow
 	 * it, up to the rotor, for one that might take the allocation. The
 	 * check is racy because the locks of all metaslab groups cannot be
 	 * held at once.
 	 */
 	for (metaslab_group_t *mgp = mg->mg_next; mgp != rotor;
 	    mgp = mgp->mg_next) {
 		uint64_t peer_qmax =
 		    mgp->mg_cur_max_alloc_queue_depth[allocator];
 		int64_t peer_qdepth = zfs_refcount_count(
 		    &mgp->mg_alloc_queue_depth[allocator]);
 
 		/* Another candidate exists, so skip this group. */
 		if (peer_qdepth < peer_qmax && !mgp->mg_no_free_space)
 			return (B_FALSE);
 	}
 
 	/*
 	 * No other group can take the allocation, so this one cannot be
 	 * skipped even though it is at or over its limit.
 	 */
 	return (B_TRUE);
 }
 
 /*
  * ==========================================================================
  * Range tree callbacks
  * ==========================================================================
  */
 
 /*
  * AVL comparator for the private size-ordered tree: segments are ordered
  * by length (larger segments towards the end of the tree), with ties
  * broken by starting offset.
  */
 static int
 metaslab_rangesize_compare(const void *x1, const void *x2)
 {
 	const range_seg_t *a = x1;
 	const range_seg_t *b = x2;
 	uint64_t a_len = a->rs_end - a->rs_start;
 	uint64_t b_len = b->rs_end - b->rs_start;
 	int by_size = AVL_CMP(a_len, b_len);
 
 	if (likely(by_size != 0))
 		return (by_size);
 
 	/* equal lengths: fall back to ordering by start offset */
 	return (AVL_CMP(a->rs_start, b->rs_start));
 }
 
 /*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
  */
 
 /*
  * Return the length of the largest contiguous free segment in the
  * metaslab, i.e. the last node of the size-ordered tree, or 0 when that
  * tree is empty.
  */
 uint64_t
 metaslab_block_maxsize(metaslab_t *msp)
 {
 	avl_tree_t *by_size = &msp->ms_allocatable_by_size;
 	range_seg_t *largest;
 
 	if (by_size == NULL)
 		return (0ULL);
 
 	largest = avl_last(by_size);
 	if (largest == NULL)
 		return (0ULL);
 
 	return (largest->rs_end - largest->rs_start);
 }
 
 /*
  * Look up the segment [start, start + size) in tree t. When avl_find()
  * has no match, return the nearest node after the insertion point
  * instead, which may be NULL.
  */
 static range_seg_t *
 metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size)
 {
 	avl_index_t where;
 	range_seg_t key;
 	range_seg_t *hit;
 
 	key.rs_start = start;
 	key.rs_end = start + size;
 
 	hit = avl_find(t, &key, &where);
 	return (hit != NULL ? hit : avl_nearest(t, where, AVL_AFTER));
 }
 
 /*
  * Allocator helper: scan AVL tree t, starting at *cursor, for the first
  * segment that can hold `size` bytes at an `align`-aligned offset. On
  * success the cursor is moved just past the allocation and the offset is
  * returned. If the scan from a non-zero cursor finds nothing, the cursor
  * is reset to 0 and the scan is repeated once from the beginning. Returns
  * -1ULL when nothing fits.
  */
 static uint64_t
 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
     uint64_t align)
 {
 	for (;;) {
 		range_seg_t *seg;
 
 		for (seg = metaslab_block_find(t, *cursor, size); seg != NULL;
 		    seg = AVL_NEXT(t, seg)) {
 			uint64_t offset = P2ROUNDUP(seg->rs_start, align);
 
 			if (offset + size <= seg->rs_end) {
 				*cursor = offset + size;
 				return (offset);
 			}
 		}
 
 		/*
 		 * A scan that began at cursor 0 has covered the whole map,
 		 * so give up. Otherwise wrap around and scan once more.
 		 */
 		if (*cursor == 0)
 			return (-1ULL);
 
 		*cursor = 0;
 	}
 }
 
 /*
  * ==========================================================================
  * The first-fit block allocator
  * ==========================================================================
  */
 static uint64_t
 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
 	avl_tree_t *offset_tree = &msp->ms_allocatable->rt_root;
 
 	/*
 	 * The alignment is the largest power of 2 that evenly divides the
 	 * requested size, and it also selects the cursor bucket. This tends
 	 * to place similarly-aligned blocks in the same area of the
 	 * metaslab, but it does not guarantee that allocations of other
 	 * sizes are absent from that region.
 	 */
 	uint64_t align = size & -size;
 	int bucket = highbit64(align) - 1;
 
 	return (metaslab_block_picker(offset_tree, &msp->ms_lbas[bucket],
 	    size, align));
 }
 
 /* Ops table for the first-fit allocator, initialized with metaslab_ff_alloc. */
 static metaslab_ops_t metaslab_ff_ops = {
 	metaslab_ff_alloc
 };
 
 /*
  * ==========================================================================
  * Dynamic block allocator -
  * Uses the first fit allocation scheme until space get low and then
  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
  * ==========================================================================
  */
 static uint64_t
 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	avl_tree_t *tree = &rt->rt_root;
 
 	/*
 	 * The cursor bucket is chosen by the largest power of 2 that evenly
 	 * divides the requested size. This tends to place similarly-aligned
 	 * blocks in the same area of the metaslab, but it does not guarantee
 	 * that allocations of other sizes are absent from that region.
 	 */
 	uint64_t lowbit = size & -size;
 	uint64_t *cursor = &msp->ms_lbas[highbit64(lowbit) - 1];
 	uint64_t largest = metaslab_block_maxsize(msp);
 	int pct_free = range_tree_space(rt) * 100 / msp->ms_size;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(tree), ==,
 	    avl_numnodes(&msp->ms_allocatable_by_size));
 
 	/* Not even the largest free segment can satisfy the request. */
 	if (size > largest)
 		return (-1ULL);
 
 	/*
 	 * When space is running low, switch to the size-sorted AVL tree
 	 * (best-fit) and restart the search from its beginning.
 	 */
 	boolean_t low_space = (largest < metaslab_df_alloc_threshold ||
 	    pct_free < metaslab_df_free_pct);
 	if (low_space) {
 		tree = &msp->ms_allocatable_by_size;
 		*cursor = 0;
 	}
 
 	return (metaslab_block_picker(tree, cursor, size, 1ULL));
 }
 
 /* Ops table for the dynamic allocator, initialized with metaslab_df_alloc. */
 static metaslab_ops_t metaslab_df_ops = {
 	metaslab_df_alloc
 };
 
 /*
  * ==========================================================================
  * Cursor fit block allocator -
  * Select the largest region in the metaslab, set the cursor to the beginning
  * of the range and the cursor_end to the end of the range. As allocations
  * are made advance the cursor. Continue allocating from the cursor until
  * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	avl_tree_t *by_size = &msp->ms_allocatable_by_size;
 	/* ms_lbas[0] is the cursor, ms_lbas[1] the end of the current range */
 	uint64_t *cur = &msp->ms_lbas[0];
 	uint64_t *cur_end = &msp->ms_lbas[1];
 	uint64_t start;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(by_size), ==, avl_numnodes(&rt->rt_root));
 
 	ASSERT3U(*cur_end, >=, *cur);
 
 	/*
 	 * The current range cannot hold this request: move on to the
 	 * largest free segment, or fail if even that one is too small.
 	 */
 	if ((*cur + size) > *cur_end) {
 		range_seg_t *largest = avl_last(by_size);
 
 		if (largest == NULL ||
 		    (largest->rs_end - largest->rs_start) < size)
 			return (-1ULL);
 
 		*cur = largest->rs_start;
 		*cur_end = largest->rs_end;
 	}
 
 	/* Hand out the space at the cursor and advance past it. */
 	start = *cur;
 	*cur = start + size;
 
 	return (start);
 }
 
 /* Ops table for the cursor-fit allocator, initialized with metaslab_cf_alloc. */
 static metaslab_ops_t metaslab_cf_ops = {
 	metaslab_cf_alloc
 };
 
 /*
  * ==========================================================================
  * New dynamic fit allocator -
  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
  * contiguous blocks. If no region is found then just use the largest segment
  * that remains.
  * ==========================================================================
  */
 
 /*
  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
  * to request from the allocator. metaslab_ndf_alloc() adds this shift to
  * the high bit of the requested size when it sizes its fallback search in
  * the size-ordered tree.
  */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
 	avl_tree_t *tree = &msp->ms_allocatable->rt_root;
 	uint64_t hbit = highbit64(size);
 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
 	uint64_t largest = metaslab_block_maxsize(msp);
 	range_seg_t key, *seg;
 	avl_index_t where;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT3U(avl_numnodes(tree), ==,
 	    avl_numnodes(&msp->ms_allocatable_by_size));
 
 	/* Not even the largest free segment can satisfy the request. */
 	if (size > largest)
 		return (-1ULL);
 
 	/* First try the offset-ordered tree at this size class's cursor. */
 	key.rs_start = *cursor;
 	key.rs_end = *cursor + size;
 	seg = avl_find(tree, &key, &where);
 
 	if (seg == NULL || (seg->rs_end - seg->rs_start) < size) {
 		/*
 		 * Nothing usable at the cursor. Search the size-ordered
 		 * tree for a segment of the clump size, capped at the
 		 * largest segment available, taking the next larger node
 		 * when there is no match.
 		 */
 		tree = &msp->ms_allocatable_by_size;
 
 		key.rs_start = 0;
 		key.rs_end = MIN(largest,
 		    1ULL << (hbit + metaslab_ndf_clump_shift));
 		seg = avl_find(tree, &key, &where);
 		if (seg == NULL)
 			seg = avl_nearest(tree, where, AVL_AFTER);
 		ASSERT(seg != NULL);
 	}
 
 	if ((seg->rs_end - seg->rs_start) < size)
 		return (-1ULL);
 
 	*cursor = seg->rs_start + size;
 	return (seg->rs_start);
 }
 
 /* Ops table for the new dynamic fit allocator (metaslab_ndf_alloc). */
 static metaslab_ops_t metaslab_ndf_ops = {
 	metaslab_ndf_alloc
 };
 
 /* Selected allocator ops; initially points at the dynamic (df) ops table. */
 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
 
 /*
  * Zero the metaslab's auxiliary histograms: ms_synchist and every
  * ms_deferhist[] entry.
  */
 static void
 metaslab_aux_histograms_clear(metaslab_t *msp)
 {
 	int slot = 0;
 
 	/*
 	 * The auxiliary histograms are cleared only as part of resetting
 	 * them, and that can only happen while the metaslab is loaded.
 	 */
 	ASSERT(msp->ms_loaded);
 
 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
 	while (slot < TXG_DEFER_SIZE) {
 		bzero(msp->ms_deferhist[slot],
 		    sizeof (msp->ms_deferhist[slot]));
 		slot++;
 	}
 }
 
 /*
  * Accumulate range tree rt's histogram into `histogram`, starting at
  * bucket `shift` of the range tree. Once the destination's last bucket
  * (SPACE_MAP_HISTOGRAM_SIZE - 1) is reached, all remaining source buckets
  * are scaled by the power-of-two difference and folded into it.
  */
 static void
 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
     range_tree_t *rt)
 {
 	/*
 	 * Modeled on space_map_histogram_add(); see that function for the
 	 * details. The result must behave like the space map histogram
 	 * rather than the range tree histogram, because it is a delta that
 	 * is later subtracted from the space map histogram.
 	 */
 	int bucket = 0;
 	int src = shift;
 
 	while (src < RANGE_TREE_HISTOGRAM_SIZE) {
 		ASSERT3U(src, >=, bucket + shift);
 		histogram[bucket] +=
 		    rt->rt_histogram[src] << (src - bucket - shift);
 
 		/* advance the destination until its last bucket is reached */
 		if (bucket < SPACE_MAP_HISTOGRAM_SIZE - 1) {
 			ASSERT3U(bucket + shift, ==, src);
 			bucket++;
 			ASSERT3U(bucket, <, SPACE_MAP_HISTOGRAM_SIZE);
 		}
 		src++;
 	}
 }
 
 /*
  * Refresh the auxiliary histograms; called on every sync pass in which
  * the metaslab is synced.
  *
  * The auxiliary histograms have to be updated wherever the metaslab's
  * space map histogram is updated, so that it stays known which parts of
  * the space map histogram are not currently available for allocation
  * (e.g. because they sit in the defer, freed, and freeing trees).
  */
 static void
 metaslab_aux_histograms_update(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	ASSERT(sm != NULL);
 
 	uint64_t shift = sm->sm_shift;
 
 	/*
 	 * This mirrors the space map histogram updates done in
 	 * metaslab_sync(), except that only segments which have not yet
 	 * reached the ms_allocatable tree are of interest here.
 	 */
 	if (msp->ms_loaded) {
 		metaslab_aux_histograms_clear(msp);
 
 		metaslab_aux_histogram_add(msp->ms_synchist, shift,
 		    msp->ms_freed);
 
 		for (int slot = 0; slot < TXG_DEFER_SIZE; slot++) {
 			metaslab_aux_histogram_add(msp->ms_deferhist[slot],
 			    shift, msp->ms_defer[slot]);
 		}
 	}
 
 	/* The freeing tree is folded in whether or not we are loaded. */
 	metaslab_aux_histogram_add(msp->ms_synchist, shift, msp->ms_freeing);
 }
 
 /*
  * Called whenever syncing (writing to) the metaslab is finished, i.e. at
  * the end of each sync pass. Rotates ms_synchist into the defer histogram
  * slot of the syncing txg (or zeroes that slot when deferring is not
  * allowed) and then clears ms_synchist.
  * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
  */
 static void
 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/*
 	 * No space map: we were reached from metaslab_init() while creating
 	 * or opening a pool, for a metaslab that has never had any
 	 * allocations.
 	 */
 	if (msp->ms_sm == NULL)
 		return;
 
 	/*
 	 * Same treatment that metaslab_sync_done() gives the ms_freed and
 	 * ms_defer trees.
 	 */
 	uint64_t slot = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
 
 	if (!defer_allowed) {
 		bzero(msp->ms_deferhist[slot],
 		    sizeof (msp->ms_deferhist[slot]));
 	} else {
 		bcopy(msp->ms_synchist, msp->ms_deferhist[slot],
 		    sizeof (msp->ms_synchist));
 	}
 	bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
 }
 
 /*
  * Ensure that the metaslab's weight and fragmentation are consistent
  * with the contents of the histogram (either the range tree's histogram
  * or the space map's depending whether the metaslab is loaded).
  *
  * The check only runs when ZFS_DEBUG_METASLAB_VERIFY is set in zfs_flags.
  * It saves the current weight, fragmentation and max size, recomputes them
  * through metaslab_weight(), and VERIFYs that the results match. The
  * caller must hold ms_lock.
  */
 static void
 metaslab_verify_weight_and_frag(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
 		return;
 
 	/* see comment in metaslab_verify_unflushed_changes() */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Devices being removed always return a weight of 0 and leave
 	 * fragmentation and ms_max_size as is - there is nothing for
 	 * us to verify here.
 	 */
 	vdev_t *vd = msp->ms_group->mg_vd;
 	if (vd->vdev_removing)
 		return;
 
 	/*
 	 * If the metaslab is dirty it probably means that we've done
 	 * some allocations or frees that have changed our histograms
 	 * and thus the weight.
 	 */
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&vd->vdev_ms_list, msp, t))
 			return;
 	}
 
 	/*
 	 * This verification checks that our in-memory state is consistent
 	 * with what's on disk. If the pool is read-only then there aren't
 	 * any changes and we just have the initially-loaded state.
 	 */
 	if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
 		return;
 
 	/* some extra verification for in-core tree if you can */
 	if (msp->ms_loaded) {
 		range_tree_stat_verify(msp->ms_allocatable);
 		VERIFY(space_map_histogram_verify(msp->ms_sm,
 		    msp->ms_allocatable));
 	}
 
 	/* Snapshot the current values so they can be compared or restored. */
 	uint64_t weight = msp->ms_weight;
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
 	uint64_t frag = msp->ms_fragmentation;
 	uint64_t max_segsize = msp->ms_max_size;
 
 	/* Reset the fields before recomputing them from scratch. */
 	msp->ms_weight = 0;
 	msp->ms_fragmentation = 0;
 	msp->ms_max_size = 0;
 
 	/*
 	 * This function is used for verification purposes. Regardless of
 	 * whether metaslab_weight() thinks this metaslab should be active or
 	 * not, we want to ensure that the actual weight (and therefore the
 	 * value of ms_weight) would be the same if it was to be recalculated
 	 * at this point.
 	 */
 	msp->ms_weight = metaslab_weight(msp) | was_active;
 
 	VERIFY3U(max_segsize, ==, msp->ms_max_size);
 
 	/*
 	 * If the weight type changed then there is no point in doing
 	 * verification. Revert fields to their original values.
 	 */
 	if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
 	    (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
 		msp->ms_fragmentation = frag;
 		msp->ms_weight = weight;
 		return;
 	}
 
 	VERIFY3U(msp->ms_fragmentation, ==, frag);
 	VERIFY3U(msp->ms_weight, ==, weight);
 }
 
 /*
  * Block on ms_load_cv until no load of this metaslab is in progress.
  * The caller must hold ms_lock.
  */
 static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	for (;;) {
 		if (!msp->ms_loading)
 			return;
 
 		ASSERT(!msp->ms_loaded);
 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
 	}
 }
 
 /*
  * Do the actual work of loading a metaslab: populate ms_allocatable from
  * the space map (or mark the whole metaslab free when it has no space map
  * yet), remove the deferred segments from it, and recompute the metaslab's
  * weight and max segment size.
  *
  * Called with ms_lock held and ms_loading set. ms_lock is dropped while
  * the space map is read and is held again on return, on both the success
  * and the error path. Returns 0 on success or the error returned by
  * space_map_load_length().
  */
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loading);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * We temporarily drop the lock to unblock other operations while we
 	 * are reading the space map. Therefore, metaslab_sync() and
 	 * metaslab_sync_done() can run at the same time as we do.
 	 *
 	 * metaslab_sync() can append to the space map while we are loading.
 	 * Therefore we load only entries that existed when we started the
 	 * load. Additionally, metaslab_sync_done() has to wait for the load
 	 * to complete because there are potential races like metaslab_load()
 	 * loading parts of the space map that are currently being appended
 	 * by metaslab_sync(). If we didn't, the ms_allocatable would have
 	 * entries that metaslab_sync_done() would try to re-add later.
 	 *
 	 * That's why before dropping the lock we remember the synced length
 	 * of the metaslab and read up to that point of the space map,
 	 * ignoring entries appended by metaslab_sync() that happen after we
 	 * drop the lock.
 	 */
 	uint64_t length = msp->ms_synced_length;
 	mutex_exit(&msp->ms_lock);
 
 	if (msp->ms_sm != NULL) {
 		error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
 		    SM_FREE, length);
 	} else {
 		/*
 		 * The space map has not been allocated yet, so treat
 		 * all the space in the metaslab as free and add it to the
 		 * ms_allocatable tree.
 		 */
 		range_tree_add(msp->ms_allocatable,
 		    msp->ms_start, msp->ms_size);
 	}
 
 	/*
 	 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
 	 * changing the ms_sm and the metaslab's range trees while we are
 	 * about to use them and populate the ms_allocatable. The ms_lock
 	 * is insufficient for this because metaslab_sync() doesn't hold
 	 * the ms_lock while writing the ms_checkpointing tree to disk.
 	 */
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 	ASSERT(!msp->ms_condensing);
 
 	/*
 	 * On a load error only ms_sync_lock is released; ms_lock stays
 	 * held when we return.
 	 */
 	if (error != 0) {
 		mutex_exit(&msp->ms_sync_lock);
 		return (error);
 	}
 
 	ASSERT3P(msp->ms_group, !=, NULL);
 	msp->ms_loaded = B_TRUE;
 
 	/*
 	 * The ms_allocatable contains the segments that exist in the
 	 * ms_defer trees [see ms_synced_length]. Thus we need to remove
 	 * them from ms_allocatable as they will be added again in
 	 * metaslab_sync_done().
 	 */
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_remove, msp->ms_allocatable);
 	}
 
 	/*
 	 * Call metaslab_recalculate_weight_and_sort() now that the
 	 * metaslab is loaded so we get the metaslab's real weight.
 	 *
 	 * Unless this metaslab was created with older software and
 	 * has not yet been converted to use segment-based weight, we
 	 * expect the new weight to be better or equal to the weight
 	 * that the metaslab had while it was not loaded. This is
 	 * because the old weight does not take into account the
 	 * consolidation of adjacent segments between TXGs. [see
 	 * comment for ms_synchist and ms_deferhist[] for more info]
 	 */
 	uint64_t weight = msp->ms_weight;
 	metaslab_recalculate_weight_and_sort(msp);
 	if (!WEIGHT_IS_SPACEBASED(weight))
 		ASSERT3U(weight, <=, msp->ms_weight);
 	msp->ms_max_size = metaslab_block_maxsize(msp);
 
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	metaslab_verify_space(msp, spa_syncing_txg(spa));
 	/* ms_sync_lock is released here; ms_lock stays held for the caller. */
 	mutex_exit(&msp->ms_sync_lock);
 
 	return (0);
 }
 
 /*
  * Load the metaslab if it is not loaded already. Any load already in
  * progress is waited for first. ms_loading is set for the duration of the
  * load, and waiters on ms_load_cv are woken when it ends. The caller must
  * hold ms_lock. Returns 0 if the metaslab was already loaded, otherwise
  * the result of metaslab_load_impl().
  */
 int
 metaslab_load(metaslab_t *msp)
 {
 	int error;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * Another thread may be loading this same metaslab. If so, wait for
 	 * it to finish; if its load succeeded there is nothing left to do.
 	 */
 	metaslab_load_wait(msp);
 	if (!msp->ms_loaded) {
 		VERIFY(!msp->ms_loading);
 		ASSERT(!msp->ms_condensing);
 
 		msp->ms_loading = B_TRUE;
 		error = metaslab_load_impl(msp);
 		msp->ms_loading = B_FALSE;
 		cv_broadcast(&msp->ms_load_cv);
 
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Unload the metaslab: empty its ms_allocatable tree, mark it not loaded,
  * clear its active bits and max size, and recompute its weight. The
  * caller must hold ms_lock.
  */
 void
 metaslab_unload(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	metaslab_verify_weight_and_frag(msp);
 
 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
 	msp->ms_loaded = B_FALSE;
 
 	msp->ms_max_size = 0;
 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
 	/*
 	 * A NULL ms_group means we were called from metaslab_fini(), where
 	 * recalculating the weight and re-sorting makes no sense.
 	 */
 	if (msp->ms_group == NULL)
 		return;
 
 	/*
 	 * Recalculate the weight explicitly, now based on the space map
 	 * since the metaslab is no longer loaded. Unloaded metaslabs always
 	 * derive their weight from the space map histograms, while loaded
 	 * ones derive it from the in-core range tree [see metaslab_load()],
 	 * so the weight always reflects the information available in-core.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 }
 
 /*
  * Apply the alloc, defer and space deltas to the vdev, then apply the same
  * deltas to the metaslab class together with the deflated value of
  * space_delta as computed by vdev_deflated_space().
  */
 static void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
 {
 	vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
 
 	/* vd is expected to be a direct child of the pool's root vdev. */
 	ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
 	ASSERT(vd->vdev_ms_count != 0);
 
 	metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
 	    vdev_deflated_space(vd, space_delta));
 }
 
 /*
  * Allocate and initialize the in-core metaslab with index `id` on the vdev
  * that backs metaslab group `mg`.
  *
  * object: space map object to open, or 0 if none exists yet.
  * txg:    txg in which the metaslab is being set up; 0 and TXG_INITIAL
  *         are handled specially (see below).
  * msp:    out parameter that receives the new metaslab on success.
  *
  * Returns 0 on success, or the error from space_map_open(), in which case
  * *msp is left untouched.
  */
 int
 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
     metaslab_t **msp)
 {
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	metaslab_t *ms;
 	int error;
 
 	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Each metaslab covers 2^vdev_ms_shift bytes, starting at id * size. */
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
 	ms->ms_allocator = -1;
 	ms->ms_new = B_TRUE;
 
 	/*
 	 * We only open space map objects that already exist. All others
 	 * will be opened when we finally allocate an object for it.
 	 *
 	 * Note:
 	 * When called from vdev_expand(), we can't call into the DMU as
 	 * we are holding the spa_config_lock as a writer and we would
 	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
 	 * that case, the object parameter is zero though, so we won't
 	 * call into the DMU.
 	 */
 	if (object != 0) {
 		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
 		    ms->ms_size, vd->vdev_ashift);
 
 		if (error != 0) {
 			/*
 			 * NOTE(review): ms_lock, ms_sync_lock and ms_load_cv
 			 * were initialized above but are not destroyed before
 			 * the structure is freed on this path -- confirm
 			 * whether mutex_destroy()/cv_destroy() are needed.
 			 */
 			kmem_free(ms, sizeof (metaslab_t));
 			return (error);
 		}
 
 		ASSERT(ms->ms_sm != NULL);
 		ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
 		ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
 	}
 
 	/*
 	 * We create the ms_allocatable here, but we don't create the
 	 * other range trees until metaslab_sync_done().  This serves
 	 * two purposes: it allows metaslab_sync_done() to detect the
 	 * addition of new space; and for debugging, it ensures that
 	 * we'd data fault on any attempt to use this metaslab before
 	 * it's ready.
 	 */
 	ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops,
 	    &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0);
 	metaslab_group_add(mg, ms);
 
 	metaslab_set_fragmentation(ms);
 
 	/*
 	 * If we're opening an existing pool (txg == 0) or creating
 	 * a new one (txg == TXG_INITIAL), all space is available now.
 	 * If we're adding space to an existing pool, the new space
 	 * does not become available until after this txg has synced.
 	 * The metaslab's weight will also be initialized when we sync
 	 * out this txg. This ensures that we don't attempt to allocate
 	 * from it before we have initialized it completely.
 	 */
 	if (txg <= TXG_INITIAL) {
 		metaslab_sync_done(ms, 0);
 		metaslab_space_update(vd, mg->mg_class,
 		    metaslab_allocated_space(ms), 0, 0);
 	}
 
 	/*
 	 * If metaslab_debug_load is set and we're initializing a metaslab
 	 * that has an allocated space map object then load the space map
 	 * so that we can verify frees.
 	 */
 	if (metaslab_debug_load && ms->ms_sm != NULL) {
 		mutex_enter(&ms->ms_lock);
 		VERIFY0(metaslab_load(ms));
 		mutex_exit(&ms->ms_lock);
 	}
 
 	/*
 	 * For any non-zero txg, dirty the vdev and queue this metaslab on
 	 * it (VDD_METASLAB) for that txg.
 	 */
 	if (txg != 0) {
 		vdev_dirty(vd, 0, NULL, txg);
 		vdev_dirty(vd, VDD_METASLAB, ms, txg);
 	}
 
 	*msp = ms;
 
 	return (0);
 }
 
 /*
  * Tear down a metaslab: detach it from its group, back its space out of
  * the vdev/class accounting, close its space map, destroy all of its range
  * trees and synchronization objects, and free the structure.
  */
 void
 metaslab_fini(metaslab_t *msp)
 {
 	/*
 	 * Capture the group and vdev up front; as the VERIFY below shows,
 	 * ms_group is NULL once metaslab_group_remove() has run.
 	 */
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 
 	metaslab_group_remove(mg, msp);
 
 	mutex_enter(&msp->ms_lock);
 	VERIFY(msp->ms_group == NULL);
 	/* Subtract this metaslab's allocated space and total size. */
 	metaslab_space_update(vd, mg->mg_class,
 	    -metaslab_allocated_space(msp), 0, -msp->ms_size);
 
 	space_map_close(msp->ms_sm);
 
 	/*
 	 * With ms_group == NULL, metaslab_unload() skips the weight
 	 * recalculation and sorting.
 	 */
 	metaslab_unload(msp);
 
 	range_tree_destroy(msp->ms_allocatable);
 	range_tree_destroy(msp->ms_freeing);
 	range_tree_destroy(msp->ms_freed);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		range_tree_destroy(msp->ms_allocating[t]);
 	}
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_destroy(msp->ms_defer[t]);
 	}
 	ASSERT0(msp->ms_deferspace);
 
 	range_tree_destroy(msp->ms_checkpointing);
 
 	/* The metaslab must no longer be on any of the vdev's txg lists. */
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
 
 	mutex_exit(&msp->ms_lock);
 	cv_destroy(&msp->ms_load_cv);
 	mutex_destroy(&msp->ms_lock);
 	mutex_destroy(&msp->ms_sync_lock);
 	/*
 	 * NOTE(review): other sites in this file compare ms_allocator
 	 * against -1 with ASSERT3S; confirm the unsigned variant is
 	 * intended here.
 	 */
 	ASSERT3U(msp->ms_allocator, ==, -1);
 
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
 #define	FRAGMENTATION_TABLE_SIZE	17
 
 /*
  * This table defines a segment size based fragmentation metric that will
  * allow each metaslab to derive its own fragmentation value. This is done
  * by calculating the space in each bucket of the spacemap histogram and
  * multiplying that by the fragmentation metric in this table. Doing
  * this for all buckets and dividing it by the total amount of free
  * space in this metaslab (i.e. the total free space in all buckets) gives
  * us the fragmentation metric. This means that a high fragmentation metric
  * equates to most of the free space being comprised of small segments.
  * Conversely, if the metric is low, then most of the free space is in
  * large segments. A 10% change in fragmentation equates to approximately
  * double the number of segments.
  *
  * This table defines 0% fragmented space using 16MB segments. Testing has
  * shown that segments that are greater than or equal to 16MB do not suffer
  * from drastic performance problems. Using this value, we derive the rest
  * of the table. Since the fragmentation value is never stored on disk, it
  * is possible to change these calculations in the future.
  */
 /*
  * Fragmentation percentage assigned to each power-of-two segment size
  * bucket, starting at 512B. metaslab_set_fragmentation() clamps its bucket
  * index to FRAGMENTATION_TABLE_SIZE - 1 before indexing this table.
  */
 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
 	100,	/* 512B	*/
 	100,	/* 1K	*/
 	98,	/* 2K	*/
 	95,	/* 4K	*/
 	90,	/* 8K	*/
 	80,	/* 16K	*/
 	70,	/* 32K	*/
 	60,	/* 64K	*/
 	50,	/* 128K	*/
 	40,	/* 256K	*/
 	30,	/* 512K	*/
 	20,	/* 1M	*/
 	15,	/* 2M	*/
 	10,	/* 4M	*/
 	5,	/* 8M	*/
 	0	/* 16M	*/
 	/*
 	 * Only 16 of the FRAGMENTATION_TABLE_SIZE (17) entries are listed;
 	 * the final entry has no initializer and is therefore 0 as well.
 	 */
 };
 
 /*
  * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
  * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
  * been upgraded and does not support this metric. Otherwise, the stored
  * value should be in the range [0, 100].
  */
 static void
 metaslab_set_fragmentation(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 	uint64_t fragmentation = 0;
 	uint64_t total = 0;
 	boolean_t feature_enabled = spa_feature_is_enabled(spa,
 	    SPA_FEATURE_SPACEMAP_HISTOGRAM);
 
 	/* Without the histogram feature there is nothing to compute from. */
 	if (!feature_enabled) {
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * A null space map means that the entire metaslab is free
 	 * and thus is not fragmented.
 	 */
 	if (msp->ms_sm == NULL) {
 		msp->ms_fragmentation = 0;
 		return;
 	}
 
 	/*
 	 * If this metaslab's space map has not been upgraded, flag it
 	 * so that we upgrade next time we encounter it.
 	 */
 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
 		uint64_t txg = spa_syncing_txg(spa);
 		vdev_t *vd = msp->ms_group->mg_vd;
 
 		/*
 		 * If we've reached the final dirty txg, then we must
 		 * be shutting down the pool. We don't want to dirty
 		 * any data past this point so skip setting the condense
 		 * flag. We can retry this action the next time the pool
 		 * is imported.
 		 */
 		if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
 			msp->ms_condense_wanted = B_TRUE;
 			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 			zfs_dbgmsg("txg %llu, requesting force condense: "
 			    "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
 			    vd->vdev_id);
 		}
 		msp->ms_fragmentation = ZFS_FRAG_INVALID;
 		return;
 	}
 
 	/*
 	 * Accumulate a space-weighted sum of the per-bucket fragmentation
 	 * values; dividing by the total space below yields the average.
 	 */
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 		uint64_t space = 0;
 		uint8_t shift = msp->ms_sm->sm_shift;
 
 		/*
 		 * Translate histogram bucket i into a zfs_frag_table index,
 		 * clamping to the table's last entry.
 		 */
 		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
 		    FRAGMENTATION_TABLE_SIZE - 1);
 
 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
 			continue;
 
 		/* Segment count in bucket i scaled by 2^(i + shift). */
 		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
 		total += space;
 
 		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
 		fragmentation += space * zfs_frag_table[idx];
 	}
 
 	/* An empty histogram leaves fragmentation at 0. */
 	if (total > 0)
 		fragmentation /= total;
 	ASSERT3U(fragmentation, <=, 100);
 
 	msp->ms_fragmentation = fragmentation;
 }
 
 /*
  * Compute a weight -- a selection preference value -- for the given metaslab.
  * This is based on the amount of free space, the level of fragmentation,
  * the LBA range, and whether the metaslab is loaded.
  */
 static uint64_t
 metaslab_space_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(!vd->vdev_removing);
 
 	/*
 	 * The baseline weight is the metaslab's free space.
 	 */
 	space = msp->ms_size - metaslab_allocated_space(msp);
 
 	if (metaslab_fragmentation_factor_enabled &&
 	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
 		/*
 		 * Use the fragmentation information to inversely scale
 		 * down the baseline weight. We need to ensure that we
 		 * don't exclude this metaslab completely when it's 100%
 		 * fragmented. To avoid this we reduce the fragmented value
 		 * by 1.
 		 */
 		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
 
 		/*
 		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
 		 * this metaslab again. The fragmentation metric may have
 		 * decreased the space to something smaller than
 		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
 		 * so that we can consume any remaining space.
 		 */
 		if (space > 0 && space < SPA_MINBLOCKSIZE)
 			space = SPA_MINBLOCKSIZE;
 	}
 	weight = space;
 
 	/*
 	 * Modern disks have uniform bit density and constant angular velocity.
 	 * Therefore, the outer recording zones are faster (higher bandwidth)
 	 * than the inner zones by the ratio of outer to inner track diameter,
 	 * which is typically around 2:1.  We account for this by assigning
 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
 	 * In effect, this means that we'll select the metaslab with the most
 	 * free bandwidth rather than simply the one with the most free space.
 	 */
 	if (metaslab_lba_weighting_enabled) {
 		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
 		ASSERT(weight >= space && weight <= 2 * space);
 	}
 
 	/*
 	 * If this metaslab is one we're actively using, adjust its
 	 * weight to make it preferable to any inactive metaslab so
 	 * we'll polish it off. If the fragmentation on this metaslab
 	 * has exceeded our threshold, then don't mark it active.
 	 */
 	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
 	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
 		/* Carry over whichever active bits are currently set. */
 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 	}
 
 	/* Tag the result as a space-based (not segment-based) weight. */
 	WEIGHT_SET_SPACEBASED(weight);
 	return (weight);
 }
 
 /*
  * Return the weight of the specified metaslab, according to the segment-based
  * weighting algorithm. The metaslab must be loaded. This function can
  * be called within a sync pass since it relies only on the metaslab's
  * range tree which is always accurate when the metaslab is loaded.
  */
 static uint64_t
 metaslab_weight_from_range_tree(metaslab_t *msp)
 {
 	uint64_t weight = 0;
 	uint32_t segments = 0;
 
 	ASSERT(msp->ms_loaded);
 
 	/* Walk the ms_allocatable histogram from the largest bucket down. */
 	for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
 	    i--) {
 		uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
 		/*
 		 * Carry the running count down one bucket: everything seen
 		 * in higher buckets is doubled, then this bucket is added.
 		 */
 		segments <<= 1;
 		segments += msp->ms_allocatable->rt_histogram[i];
 
 		/*
 		 * The range tree provides more precision than the space map
 		 * and must be downgraded so that all values fit within the
 		 * space map's histogram. This allows us to compare loaded
 		 * vs. unloaded metaslabs to determine which metaslab is
 		 * considered "best".
 		 */
 		if (i > max_idx)
 			continue;
 
 		/* The first non-empty bucket at or below max_idx wins. */
 		if (segments != 0) {
 			WEIGHT_SET_COUNT(weight, segments);
 			WEIGHT_SET_INDEX(weight, i);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
 	/* 0 if every bucket examined was empty. */
 	return (weight);
 }
 
 /*
  * Calculate the weight based on the on-disk histogram. This should only
  * be called after a sync pass has completely finished since the on-disk
  * information is updated in metaslab_sync().
  */
 static uint64_t
 metaslab_weight_from_spacemap(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	ASSERT(!msp->ms_loaded);
 	ASSERT(sm != NULL);
 	ASSERT3U(space_map_object(sm), !=, 0);
 	ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * Create a joint histogram from all the segments that have made
 	 * it to the metaslab's space map histogram, that are not yet
 	 * available for allocation because they are still in the freeing
 	 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
 	 * these segments from the space map's histogram to get a more
 	 * accurate weight.
 	 */
 	uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
 	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
 		deferspace_histogram[i] += msp->ms_synchist[i];
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
 			deferspace_histogram[i] += msp->ms_deferhist[t][i];
 		}
 	}
 
 	/*
 	 * Scan from the largest bucket down and encode the first bucket
 	 * that still has segments after the subtraction.
 	 */
 	uint64_t weight = 0;
 	for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
 		ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
 		    deferspace_histogram[i]);
 		uint64_t count =
 		    sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
 		if (count != 0) {
 			WEIGHT_SET_COUNT(weight, count);
 			/* Bucket i is offset by the space map's sm_shift. */
 			WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
 			WEIGHT_SET_ACTIVE(weight, 0);
 			break;
 		}
 	}
 	/* 0 if no bucket has any remaining segments. */
 	return (weight);
 }
 
 /*
  * Compute a segment-based weight for the specified metaslab. The weight
  * is determined by highest bucket in the histogram. The information
  * for the highest bucket is encoded into the weight value.
  */
 static uint64_t
 metaslab_segment_weight(metaslab_t *msp)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	uint64_t weight = 0;
 	uint8_t shift = mg->mg_vd->vdev_ashift;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * The metaslab is completely free.
 	 */
 	if (metaslab_allocated_space(msp) == 0) {
 		int idx = highbit64(msp->ms_size) - 1;
 		int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
 
 		/*
 		 * Encode the whole metaslab as one segment in bucket idx,
 		 * or, when idx does not fit below max_idx, as
 		 * 2^(idx - max_idx) segments in bucket max_idx.
 		 */
 		if (idx < max_idx) {
 			WEIGHT_SET_COUNT(weight, 1ULL);
 			WEIGHT_SET_INDEX(weight, idx);
 		} else {
 			WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
 			WEIGHT_SET_INDEX(weight, max_idx);
 		}
 		WEIGHT_SET_ACTIVE(weight, 0);
 		ASSERT(!WEIGHT_IS_SPACEBASED(weight));
 
 		return (weight);
 	}
 
 	ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
 
 	/*
 	 * If the metaslab is fully allocated then just make the weight 0.
 	 */
 	if (metaslab_allocated_space(msp) == msp->ms_size)
 		return (0);
 	/*
 	 * If the metaslab is already loaded, then use the range tree to
 	 * determine the weight. Otherwise, we rely on the space map information
 	 * to generate the weight.
 	 */
 	if (msp->ms_loaded) {
 		weight = metaslab_weight_from_range_tree(msp);
 	} else {
 		weight = metaslab_weight_from_spacemap(msp);
 	}
 
 	/*
 	 * If the metaslab was active the last time we calculated its weight
 	 * then keep it active. We want to consume the entire region that
 	 * is associated with this weight.
 	 */
 	if (msp->ms_activation_weight != 0 && weight != 0)
 		WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
 	return (weight);
 }
 
 /*
  * Determine if we should attempt to allocate from this metaslab. If the
  * metaslab has a maximum size then we can quickly determine if the desired
  * allocation size can be satisfied. Otherwise, if we're using segment-based
  * weighting then we can determine the maximum allocation that this metaslab
  * can accommodate based on the index encoded in the weight. If we're using
  * space-based weights then rely on the entire weight (excluding the weight
  * type bit).
  */
 boolean_t
 metaslab_should_allocate(metaslab_t *msp, uint64_t asize)
 {
 	boolean_t should_allocate;
 
 	/* A non-zero ms_max_size answers the question directly. */
 	if (msp->ms_max_size != 0)
 		return (msp->ms_max_size >= asize);
 
 	if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 		/*
 		 * The metaslab segment weight indicates segments in the
 		 * range [2^i, 2^(i+1)), where i is the index in the weight.
 		 * Since the asize might be in the middle of the range, we
 		 * should attempt the allocation if asize < 2^(i+1).
 		 */
 		should_allocate = (asize <
 		    1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
 	} else {
 		/* Space-based: compare against the weight minus the type bit. */
 		should_allocate = (asize <=
 		    (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
 	}
 	return (should_allocate);
 }
 
 /*
  * Compute the current weight of a metaslab. As side effects this refreshes
  * ms_fragmentation and, for a loaded metaslab, ms_max_size. It then uses
  * either the segment-based or the space-based algorithm. Returns 0 when
  * the vdev is being removed. The caller must hold ms_lock.
  */
 static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	uint64_t weight;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	/*
 	 * If this vdev is in the process of being removed, there is nothing
 	 * for us to do here.
 	 */
 	if (vd->vdev_removing)
 		return (0);
 
 	metaslab_set_fragmentation(msp);
 
 	/*
 	 * Update the maximum size if the metaslab is loaded. This will
 	 * ensure that we get an accurate maximum size if newly freed space
 	 * has been added back into the free tree.
 	 */
 	if (msp->ms_loaded)
 		msp->ms_max_size = metaslab_block_maxsize(msp);
 	else
 		ASSERT0(msp->ms_max_size);
 
 	/*
 	 * Segment-based weighting requires space map histogram support:
 	 * the tunable and the pool feature must both be enabled, and the
 	 * space map must either be absent or have the
 	 * sizeof (space_map_phys_t) dbuf size.
 	 */
 	if (zfs_metaslab_segment_weight_enabled &&
 	    spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
 	    (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
 	    sizeof (space_map_phys_t))) {
 		weight = metaslab_segment_weight(msp);
 	} else {
 		weight = metaslab_space_weight(msp);
 	}
 	return (weight);
 }
 
 /*
  * Recompute the metaslab's weight with metaslab_weight() and re-sort it
  * within its group, keeping whichever METASLAB_ACTIVE_MASK bits were set
  * in the old weight.
  */
 void
 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
 {
 	/* note: we preserve the mask (e.g. indication of primary, etc..) */
 	uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 	metaslab_group_sort(msp->ms_group, msp,
 	    metaslab_weight(msp) | was_active);
 }
 
 /*
  * Register msp as the primary or secondary metaslab of `allocator` in
  * group mg, depending on activation_weight.
  *
  * Returns 0 on success (or immediately for METASLAB_WEIGHT_CLAIM, which
  * registers nothing), and EEXIST if the chosen slot is already occupied.
  * The caller must hold ms_lock; mg_lock is taken internally.
  */
 static int
 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     int allocator, uint64_t activation_weight)
 {
 	/*
 	 * If we're activating for the claim code, we don't want to actually
 	 * set the metaslab up for a specific allocator.
 	 */
 	if (activation_weight == METASLAB_WEIGHT_CLAIM)
 		return (0);
 	/* Anything other than PRIMARY goes to the secondaries array. */
 	metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
 	    mg->mg_primaries : mg->mg_secondaries);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	mutex_enter(&mg->mg_lock);
 	if (arr[allocator] != NULL) {
 		mutex_exit(&mg->mg_lock);
 		return (EEXIST);
 	}
 
 	arr[allocator] = msp;
 	ASSERT3S(msp->ms_allocator, ==, -1);
 	msp->ms_allocator = allocator;
 	msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
 	mutex_exit(&mg->mg_lock);
 
 	return (0);
 }
 
 /*
  * Make msp active for `allocator` with the given activation weight, unless
  * it already carries an active bit. The metaslab is loaded first if needed.
  *
  * Returns 0 on success, the metaslab_load() error, EBUSY if the metaslab
  * turned out to be active after the load, or the error from
  * metaslab_activate_allocator(). The caller must hold ms_lock.
  */
 static int
 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int error = metaslab_load(msp);
 		if (error != 0) {
 			/* A metaslab we cannot load is re-sorted with weight 0. */
 			metaslab_group_sort(msp->ms_group, msp, 0);
 			return (error);
 		}
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
 			 * The metaslab was activated for another allocator
 			 * while we were waiting, we should reselect.
 			 */
 			return (EBUSY);
 		}
 		if ((error = metaslab_activate_allocator(msp->ms_group, msp,
 		    allocator, activation_weight)) != 0) {
 			return (error);
 		}
 
 		/* Record the weight before the activation bit is added. */
 		msp->ms_activation_weight = msp->ms_weight;
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
 	ASSERT(msp->ms_loaded);
 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
 
 /*
  * Remove msp from its allocator slot in group mg (primary or secondary,
  * per ms_primary), reset ms_allocator to -1 and re-sort the metaslab with
  * the new weight. The caller must hold ms_lock.
  */
 static void
 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     uint64_t weight)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	/*
 	 * Claim activations are never registered in the allocator arrays
 	 * (see metaslab_activate_allocator()), so only a re-sort is needed.
 	 */
 	if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
 		metaslab_group_sort(mg, msp, weight);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 	ASSERT3P(msp->ms_group, ==, mg);
 	if (msp->ms_primary) {
 		/*
 		 * NOTE(review): if ASSERT3U compares its operands as
 		 * unsigned, the "0 <=" check below can never fail --
 		 * confirm whether a signed assertion was intended.
 		 */
 		ASSERT3U(0, <=, msp->ms_allocator);
 		ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
 		ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
 		mg->mg_primaries[msp->ms_allocator] = NULL;
 	} else {
 		ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
 		ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
 		mg->mg_secondaries[msp->ms_allocator] = NULL;
 	}
 	msp->ms_allocator = -1;
 	/* mg_lock is already held here, hence the _impl variant. */
 	metaslab_group_sort_impl(mg, msp, weight);
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Deactivate a metaslab: clear ms_activation_weight and hand it to
  * metaslab_passivate_allocator() with the new weight, which must not
  * contain any METASLAB_ACTIVE_MASK bits.
  */
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
 	uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
 
 	/*
 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
 	 * this metaslab again.  In that case, it had better be empty,
 	 * or we would be leaving space on the table.
 	 */
 	ASSERT(size >= SPA_MINBLOCKSIZE ||
 	    range_tree_is_empty(msp->ms_allocatable));
 	ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
 	msp->ms_activation_weight = 0;
 	metaslab_passivate_allocator(msp->ms_group, msp, weight);
 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
  * Segment-based metaslabs are activated once and remain active until
  * we either fail an allocation attempt (similar to space-based metaslabs)
  * or have exhausted the free space in zfs_metaslab_switch_threshold
  * buckets since the metaslab was activated. This function checks to see
  * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
  * metaslab and passivates it proactively. This will allow us to select a
  * metaslab with a larger contiguous region, if any remain within this
  * metaslab group. If we're in sync pass > 1, then we continue using this
  * metaslab so that we don't dirty more blocks and cause more sync passes.
  */
 void
 metaslab_segment_may_passivate(metaslab_t *msp)
 {
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	/* Only segment-based weights in the first sync pass qualify. */
 	if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
 		return;
 
 	/*
 	 * Since we are in the middle of a sync pass, the most accurate
 	 * information that is accessible to us is the in-core range tree
 	 * histogram; calculate the new weight based on that information.
 	 */
 	uint64_t weight = metaslab_weight_from_range_tree(msp);
 	int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
 	int current_idx = WEIGHT_GET_INDEX(weight);
 
 	/*
 	 * Passivate once the current index is at least
 	 * zfs_metaslab_switch_threshold below the index at activation.
 	 */
 	if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
 		metaslab_passivate(msp, weight);
 }
 
 /*
  * Taskq callback (dispatched by metaslab_group_preload()): load the
  * metaslab passed as `arg` under its ms_lock and stamp ms_selected_txg
  * with the currently syncing txg. A load failure is deliberately ignored.
  */
 static void
 metaslab_preload(void *arg)
 {
 	metaslab_t *msp = arg;
 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
 
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
 	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Dispatch metaslab_preload() tasks on the group's taskq for the first
  * metaslab_preload_limit metaslabs in mg_metaslab_tree order, plus any
  * later metaslab that has ms_condense_wanted set.
  */
 static void
 metaslab_group_preload(metaslab_group_t *mg)
 {
 	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
 	int m = 0;
 
 	/*
 	 * When the pool is shutting down or preloading is disabled, dispatch
 	 * nothing new and just wait on the group's taskq.
 	 */
 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
 		taskq_wait(mg->mg_taskq);
 		return;
 	}
 
 	mutex_enter(&mg->mg_lock);
 
 	/*
 	 * Load the next potential metaslabs
 	 */
 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
 		ASSERT3P(msp->ms_group, ==, mg);
 
 		/*
 		 * We preload only the maximum number of metaslabs specified
 		 * by metaslab_preload_limit. If a metaslab is being forced
 		 * to condense then we preload it too. This will ensure
 		 * that force condensing happens in the next txg.
 		 *
 		 * Note the `continue` (not `break`): the walk keeps going
 		 * past the limit to find condense-wanted metaslabs.
 		 */
 		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
 			continue;
 		}
 
 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
-		    msp, TQ_SLEEP) != NULL);
+		    msp, TQ_SLEEP) != TASKQID_INVALID);
 	}
 	mutex_exit(&mg->mg_lock);
 }
 
 /*
  * Determine if the space map's on-disk footprint is past our tolerance
  * for inefficiency. We would like to use the following criteria to make
  * our decision:
  *
  * 1. The size of the space map object should not dramatically increase as a
  * result of writing out the free space range tree.
  *
  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
  * times the size of the free space range tree representation
  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
  *
  * 3. The on-disk size of the space map should actually decrease.
  *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
  * zfs_metaslab_condense_block_threshold - we only condense if the space used
  * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	/*
 	 * NOTE(review): the shift below is performed on the int constant 1
 	 * before widening to uint64_t -- confirm vdev_ashift is always small
 	 * enough for that, or whether 1ULL was intended.
 	 */
 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
 	uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	/*
 	 * Allocations and frees in early passes are generally more space
 	 * efficient (in terms of blocks described in space map entries)
 	 * than the ones in later passes (e.g. we don't compress after
 	 * sync pass 5) and condensing a metaslab multiple times in a txg
 	 * could degrade performance.
 	 *
 	 * Thus we prefer condensing each metaslab at most once every txg at
 	 * the earliest sync pass possible. If a metaslab is eligible for
 	 * condensing again after being considered for condensing within the
 	 * same txg, it will hopefully be dirty in the next txg where it will
 	 * be condensed at an earlier pass.
 	 */
 	if (msp->ms_condense_checked_txg == current_txg)
 		return (B_FALSE);
 	msp->ms_condense_checked_txg = current_txg;
 
 	/*
 	 * We always condense metaslabs that are empty and metaslabs for
 	 * which a condense request has been made.
 	 */
 	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
 	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
 	uint64_t object_size = space_map_length(msp->ms_sm);
 	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
 	    msp->ms_allocatable, SM_NO_VDEVID);
 
 	/* Use the larger of the object's data block size and the vdev's. */
 	dmu_object_info_t doi;
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
 	uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
 
 	/*
 	 * Condense only if the current length is at least zfs_condense_pct
 	 * percent of the estimated optimal size and also exceeds
 	 * zfs_metaslab_condense_block_threshold records.
 	 */
 	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*
  * Condense the on-disk space map representation to its minimized form.
  * The minimized form consists of a small number of allocations followed by
  * the entries of the free range tree.
  */
 static void
 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
 {
 	range_tree_t *condense_tree;
 	space_map_t *sm = msp->ms_sm;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 	ASSERT(msp->ms_loaded);
 
 	zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
 	    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
 	    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
 	    msp->ms_group->mg_vd->vdev_spa->spa_name,
 	    space_map_length(msp->ms_sm),
 	    avl_numnodes(&msp->ms_allocatable->rt_root),
 	    msp->ms_condense_wanted ? "TRUE" : "FALSE");
 
 	/* The request, if any, is being honored now. */
 	msp->ms_condense_wanted = B_FALSE;
 
 	/*
 	 * Create a range tree that is 100% allocated. We remove segments
 	 * that have been freed in this txg, any deferred frees that exist,
 	 * and any allocation in the future. Removing segments should be
 	 * a relatively inexpensive operation since we expect these trees to
 	 * have a small number of nodes.
 	 */
 	condense_tree = range_tree_create(NULL, NULL);
 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
 
 	range_tree_walk(msp->ms_freeing, range_tree_remove, condense_tree);
 	range_tree_walk(msp->ms_freed, range_tree_remove, condense_tree);
 
 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 		range_tree_walk(msp->ms_defer[t],
 		    range_tree_remove, condense_tree);
 	}
 
 	/* t starts at 1: only the allocating trees of later txgs. */
 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 		range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
 		    range_tree_remove, condense_tree);
 	}
 
 	/*
 	 * We're about to drop the metaslab's lock thus allowing
 	 * other consumers to change its content. Set the
 	 * metaslab's ms_condensing flag to ensure that
 	 * allocations on this metaslab do not occur while we're
 	 * in the middle of committing it to disk. This is only critical
 	 * for ms_allocatable as all other range trees use per txg
 	 * views of their content.
 	 */
 	msp->ms_condensing = B_TRUE;
 
 	mutex_exit(&msp->ms_lock);
 	space_map_truncate(sm, zfs_metaslab_sm_blksz, tx);
 
 	/*
 	 * While we would ideally like to create a space map representation
 	 * that consists only of allocation records, doing so can be
 	 * prohibitively expensive because the in-core free tree can be
 	 * large, and therefore computationally expensive to subtract
 	 * from the condense_tree. Instead we sync out two trees, a cheap
 	 * allocation only tree followed by the in-core free tree. While not
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
 	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 
 	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	/* Retake ms_lock before returning; the caller expects it held. */
 	mutex_enter(&msp->ms_lock);
 	msp->ms_condensing = B_FALSE;
 }
 
 /*
  * Write a metaslab to disk in the context of the specified transaction group.
  */
 void
 metaslab_sync(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa_meta_objset(spa);
 	range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 	dmu_tx_t *tx;
 	uint64_t object = space_map_object(msp->ms_sm);
 
 	ASSERT(!vd->vdev_ishole);
 
 	/*
 	 * This metaslab has just been added so there's no work to do now.
 	 */
 	if (msp->ms_freeing == NULL) {
 		ASSERT3P(alloctree, ==, NULL);
 		return;
 	}
 
 	ASSERT3P(alloctree, !=, NULL);
 	ASSERT3P(msp->ms_freeing, !=, NULL);
 	ASSERT3P(msp->ms_freed, !=, NULL);
 	ASSERT3P(msp->ms_checkpointing, !=, NULL);
 
 	/*
 	 * Normally, we don't want to process a metaslab if there are no
 	 * allocations or frees to perform. However, if the metaslab is being
 	 * forced to condense and it's loaded, we need to let it through.
 	 */
 	if (range_tree_is_empty(alloctree) &&
 	    range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing) &&
 	    !(msp->ms_loaded && msp->ms_condense_wanted))
 		return;
 
 
 	VERIFY(txg <= spa_final_dirty_txg(spa));
 
 	/*
 	 * The only state that can actually be changing concurrently
 	 * with metaslab_sync() is the metaslab's ms_allocatable. No
 	 * other thread can be modifying this txg's alloc, freeing,
 	 * freed, or space_map_phys_t.  We drop ms_lock whenever we
 	 * could call into the DMU, because the DMU can call down to
 	 * us (e.g. via zio_free()) at any time.
 	 *
 	 * The spa_vdev_remove_thread() can be reading metaslab state
 	 * concurrently, and it is locked out by the ms_sync_lock.
 	 * Note that the ms_lock is insufficient for this, because it
 	 * is dropped by space_map_write().
 	 */
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	if (msp->ms_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, zfs_metaslab_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
 		    msp->ms_start, msp->ms_size, vd->vdev_ashift));
 
 		ASSERT(msp->ms_sm != NULL);
 		ASSERT0(metaslab_allocated_space(msp));
 	}
 
 	if (!range_tree_is_empty(msp->ms_checkpointing) &&
 	    vd->vdev_checkpoint_sm == NULL) {
 		ASSERT(spa_has_checkpoint(spa));
 
 		uint64_t new_object = space_map_alloc(mos,
 		    vdev_standard_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
 		    mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * We save the space map object as an entry in vdev_top_zap
 		 * so it can be retrieved when the pool is reopened after an
 		 * export or through zdb.
 		 */
 		VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (new_object), 1, &new_object, tx));
 	}
 
 	mutex_enter(&msp->ms_sync_lock);
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * Note: metaslab_condense() clears the space map's histogram.
 	 * Therefore we must verify and remove this histogram before
 	 * condensing.
 	 */
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 	metaslab_group_histogram_remove(mg, msp);
 
 	if (msp->ms_loaded && metaslab_should_condense(msp)) {
 		metaslab_condense(msp, txg, tx);
 	} else {
 		mutex_exit(&msp->ms_lock);
 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
 		    SM_NO_VDEVID, tx);
 		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
 		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
 	msp->ms_allocated_space += range_tree_space(alloctree);
 	ASSERT3U(msp->ms_allocated_space, >=,
 	    range_tree_space(msp->ms_freeing));
 	msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
 
 	if (!range_tree_is_empty(msp->ms_checkpointing)) {
 		ASSERT(spa_has_checkpoint(spa));
 		ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 		/*
 		 * Since we are doing writes to disk and the ms_checkpointing
 		 * tree won't be changing during that time, we drop the
 		 * ms_lock while writing to the checkpoint space map.
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
 		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 
 		spa->spa_checkpoint_info.sci_dspace +=
 		    range_tree_space(msp->ms_checkpointing);
 		vd->vdev_stat.vs_checkpoint_space +=
 		    range_tree_space(msp->ms_checkpointing);
 		ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
 		    -space_map_allocated(vd->vdev_checkpoint_sm));
 
 		range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
 	}
 
 	if (msp->ms_loaded) {
 		/*
 		 * When the space map is loaded, we have an accurate
 		 * histogram in the range tree. This gives us an opportunity
 		 * to bring the space map's histogram up-to-date so we clear
 		 * it first before updating it.
 		 */
 		space_map_histogram_clear(msp->ms_sm);
 		space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
 
 		/*
 		 * Since we've cleared the histogram we need to add back
 		 * any free space that has already been processed, plus
 		 * any deferred space. This allows the on-disk histogram
 		 * to accurately reflect all free space even if some space
 		 * is not yet available for allocation (i.e. deferred).
 		 */
 		space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
 
 		/*
 		 * Add back any deferred free space that has not been
 		 * added back into the in-core free tree yet. This will
 		 * ensure that we don't end up with a space map histogram
 		 * that is completely empty unless the metaslab is fully
 		 * allocated.
 		 */
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			space_map_histogram_add(msp->ms_sm,
 			    msp->ms_defer[t], tx);
 		}
 	}
 
 	/*
 	 * Always add the free space from this sync pass to the space
 	 * map histogram. We want to make sure that the on-disk histogram
 	 * accounts for all free space. If the space map is not loaded,
 	 * then we will lose some accuracy but will correct it the next
 	 * time we load the space map.
 	 */
 	space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
 	metaslab_aux_histograms_update(msp);
 
 	metaslab_group_histogram_add(mg, msp);
 	metaslab_group_histogram_verify(mg);
 	metaslab_class_histogram_verify(mg->mg_class);
 
 	/*
 	 * For sync pass 1, we avoid traversing this txg's free range tree
 	 * and instead will just swap the pointers for freeing and freed.
 	 * We can safely do this since the freed_tree is guaranteed to be
 	 * empty on the initial pass.
 	 */
 	if (spa_sync_pass(spa) == 1) {
 		range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
 		ASSERT0(msp->ms_allocated_this_txg);
 	} else {
 		range_tree_vacate(msp->ms_freeing,
 		    range_tree_add, msp->ms_freed);
 	}
 	msp->ms_allocated_this_txg += range_tree_space(alloctree);
 	range_tree_vacate(alloctree, NULL, NULL);
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
 	    & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	mutex_exit(&msp->ms_lock);
 
 	if (object != space_map_object(msp->ms_sm)) {
 		object = space_map_object(msp->ms_sm);
 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
 		    msp->ms_id, sizeof (uint64_t), &object, tx);
 	}
 	mutex_exit(&msp->ms_sync_lock);
 	dmu_tx_commit(tx);
 }
 
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
  *
  * With ms_lock held, this routine:
  *  - creates the metaslab's range trees on its first pass (ms_freed is
  *    still NULL) and reports ms_size through metaslab_space_update();
  *  - computes this txg's allocation and deferred-space deltas and reports
  *    them through metaslab_space_update();
  *  - empties the defer slot for this txg and then either swaps ms_freed
  *    into that slot (deferral allowed) or vacates ms_freed right away;
  *  - re-dirties the metaslab for txg + 1 while ms_deferspace is non-zero;
  *  - recalculates the metaslab's weight and, if it has been idle for more
  *    than metaslab_unload_delay txgs, passivates and unloads it.
  *
  * msp: the metaslab whose sync has completed
  * txg: the transaction group that has just been synced
  */
 void
 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	vdev_t *vd = mg->mg_vd;
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t **defer_tree;
 	int64_t alloc_delta, defer_delta;
 	boolean_t defer_allowed = B_TRUE;
 
 	ASSERT(!vd->vdev_ishole);
 
 	mutex_enter(&msp->ms_lock);
 
 	/*
 	 * If this metaslab is just becoming available, initialize its
 	 * range trees and add its capacity to the vdev.
 	 */
 	if (msp->ms_freed == NULL) {
 		for (int t = 0; t < TXG_SIZE; t++) {
 			ASSERT(msp->ms_allocating[t] == NULL);
 
 			msp->ms_allocating[t] = range_tree_create(NULL, NULL);
 		}
 
 		ASSERT3P(msp->ms_freeing, ==, NULL);
 		msp->ms_freeing = range_tree_create(NULL, NULL);
 
 		ASSERT3P(msp->ms_freed, ==, NULL);
 		msp->ms_freed = range_tree_create(NULL, NULL);
 
 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
 			ASSERT(msp->ms_defer[t] == NULL);
 
 			msp->ms_defer[t] = range_tree_create(NULL, NULL);
 		}
 
 		ASSERT3P(msp->ms_checkpointing, ==, NULL);
 		msp->ms_checkpointing = range_tree_create(NULL, NULL);
 
 		metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
 	}
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	/* The defer slots are used round-robin, indexed by txg. */
 	defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
 
 	/*
 	 * Frees are not deferred when the normal class's unallocated space
 	 * is at or below the pool's slop space, or when this vdev is being
 	 * removed.
 	 */
 	uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
 	    metaslab_class_get_alloc(spa_normal_class(spa));
 	if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
 		defer_allowed = B_FALSE;
 	}
 
 	/*
 	 * alloc_delta is the space allocated in this txg minus the space
 	 * in ms_freed. defer_delta is the net change in deferred space:
 	 * the old contents of this txg's defer slot always leave it, and
 	 * ms_freed takes their place only when deferral is allowed.
 	 */
 	defer_delta = 0;
 	alloc_delta = msp->ms_allocated_this_txg -
 	    range_tree_space(msp->ms_freed);
 	if (defer_allowed) {
 		defer_delta = range_tree_space(msp->ms_freed) -
 		    range_tree_space(*defer_tree);
 	} else {
 		defer_delta -= range_tree_space(*defer_tree);
 	}
 
 	metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
 	    defer_delta, 0);
 
 	/*
 	 * If there's a metaslab_load() in progress, wait for it to complete
 	 * so that we have a consistent view of the in-core space map.
 	 */
 	metaslab_load_wait(msp);
 
 	/*
 	 * Move the frees from the defer_tree back to the free
 	 * range tree (if it's loaded). Swap the freed_tree and
 	 * the defer_tree -- this is safe to do because we've
 	 * just emptied out the defer_tree.
 	 */
 	range_tree_vacate(*defer_tree,
 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
 	if (defer_allowed) {
 		range_tree_swap(&msp->ms_freed, defer_tree);
 	} else {
 		/*
 		 * No deferral: ms_freed is emptied immediately, going into
 		 * ms_allocatable when the metaslab is loaded.
 		 */
 		range_tree_vacate(msp->ms_freed,
 		    msp->ms_loaded ? range_tree_add : NULL,
 		    msp->ms_allocatable);
 	}
 
 	/* Remember the space map's length as of this completed sync. */
 	msp->ms_synced_length = space_map_length(msp->ms_sm);
 
 	msp->ms_deferspace += defer_delta;
 	ASSERT3S(msp->ms_deferspace, >=, 0);
 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
 	if (msp->ms_deferspace != 0) {
 		/*
 		 * Keep syncing this metaslab until all deferred frees
 		 * are back in circulation.
 		 */
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}
 	metaslab_aux_histograms_update_done(msp, defer_allowed);
 
 	/*
 	 * The first completed sync of a new metaslab clears ms_new and
 	 * counts the metaslab in mg_ms_ready (updated under mg_lock).
 	 */
 	if (msp->ms_new) {
 		msp->ms_new = B_FALSE;
 		mutex_enter(&mg->mg_lock);
 		mg->mg_ms_ready++;
 		mutex_exit(&mg->mg_lock);
 	}
 
 	/*
 	 * Re-sort metaslab within its group now that we've adjusted
 	 * its allocatable space.
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
 	/*
 	 * If the metaslab is loaded and we've not tried to load or allocate
 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
 	 */
 	if (msp->ms_loaded &&
 	    msp->ms_initializing == 0 &&
 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
 		/* None of the upcoming txgs may have pending allocations. */
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 			VERIFY0(range_tree_space(
 			    msp->ms_allocating[(txg + t) & TXG_MASK]));
 		}
 		/* Only a metaslab that is owned by an allocator is passivated. */
 		if (msp->ms_allocator != -1) {
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_ACTIVE_MASK);
 		}
 
 		if (!metaslab_debug_unload)
 			metaslab_unload(msp);
 	}
 
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
 	ASSERT0(range_tree_space(msp->ms_checkpointing));
 
 	/* Reset the per-txg allocation counter for the next txg. */
 	msp->ms_allocated_this_txg = 0;
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Reassess a metaslab group: update its allocation state, recompute its
  * fragmentation and, for active groups, kick off metaslab preloading.
  * All of this is done while holding SCL_ALLOC as reader.
  */
 void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	spa_t *pool = mg->mg_class->mc_spa;
 
 	spa_config_enter(pool, SCL_ALLOC, FTAG, RW_READER);
 
 	metaslab_group_alloc_update(mg);
 	mg->mg_fragmentation = metaslab_group_fragmentation(mg);
 
 	/*
 	 * Only active metaslab groups get their next potential metaslabs
 	 * preloaded. Device removal dirties metaslabs of a group that is
 	 * no longer active, which would otherwise make that group look
 	 * eligible for preloading.
 	 */
 	if (mg->mg_activation_count > 0)
 		metaslab_group_preload(mg);
 
 	spa_config_exit(pool, SCL_ALLOC, FTAG);
 }
 
 /*
  * Ditto blocks (more than one DVA for a BP) that land on the same vdev as
  * an existing DVA should go to a different metaslab than that DVA.
  *
  * Returns B_TRUE when 'msp' does not hold 'dva': the DVA is empty, lives
  * on another vdev, or maps to a different metaslab id on this vdev.
  */
 static boolean_t
 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
 {
 	/* An unallocated DVA cannot collide with anything. */
 	if (DVA_GET_ASIZE(dva) == 0)
 		return (B_TRUE);
 
 	vdev_t *vd = msp->ms_group->mg_vd;
 
 	/* A DVA on another vdev cannot share this metaslab. */
 	if (DVA_GET_VDEV(dva) != vd->vdev_id)
 		return (B_TRUE);
 
 	/* Same vdev: compare the metaslab index derived from the offset. */
 	uint64_t dva_ms_id = DVA_GET_OFFSET(dva) >> vd->vdev_ms_shift;
 
 	return (dva_ms_id != msp->ms_id);
 }
 
 /*
  * ==========================================================================
  * Metaslab allocation tracing facility
  * ==========================================================================
  */
 /*
  * kstat handle for "metaslab_trace_stats"; created and installed by
  * metaslab_alloc_trace_init() and deleted by metaslab_alloc_trace_fini().
  * It stays NULL if kstat_create() fails.
  */
 kstat_t *metaslab_trace_ksp;
 /*
  * Named kstat counter exported through metaslab_trace_ksp. It is bumped by
  * metaslab_trace_add() whenever a trace list is already at
  * metaslab_trace_max_entries and an old entry has to be evicted.
  */
 kstat_named_t metaslab_trace_over_limit;
 
 /*
  * Set up the allocation tracing facility: create the kmem cache that backs
  * trace entries and publish the "metaslab_trace_stats" kstat. Failure to
  * create the kstat is tolerated; tracing then runs without the counter
  * being exported.
  */
 void
 metaslab_alloc_trace_init(void)
 {
 	ASSERT(metaslab_alloc_trace_cache == NULL);
 
 	metaslab_alloc_trace_cache = kmem_cache_create(
 	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
 	    0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats",
 	    "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
 	if (metaslab_trace_ksp == NULL)
 		return;
 
 	/* Point the virtual kstat at our single named counter. */
 	metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit;
 	kstat_named_init(&metaslab_trace_over_limit,
 	    "metaslab_trace_over_limit", KSTAT_DATA_UINT64);
 	kstat_install(metaslab_trace_ksp);
 }
 
 void
 metaslab_alloc_trace_fini(void)
 {
 	if (metaslab_trace_ksp != NULL) {
 		kstat_delete(metaslab_trace_ksp);
 		metaslab_trace_ksp = NULL;
 	}
 	kmem_cache_destroy(metaslab_alloc_trace_cache);
 	metaslab_alloc_trace_cache = NULL;
 }
 
 /*
  * Append one allocation trace record to the zio's trace list.
  *
  * Does nothing when tracing is disabled. 'offset' carries either the
  * allocated offset or one of the TRACE_* codes supplied by the caller;
  * 'msp' may be NULL, in which case the recorded weight is 0.
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
     metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
     int allocator)
 {
 	if (!metaslab_trace_enabled)
 		return;
 
 	/*
 	 * A full list makes room by dropping its second element. Keeping
 	 * the head preserves the original entry as a clue to which
 	 * allocation steps have already been performed.
 	 */
 	if (zal->zal_size == metaslab_trace_max_entries) {
 #ifdef DEBUG
 		panic("too many entries in allocation list");
 #endif
 		atomic_inc_64(&metaslab_trace_over_limit.value.ui64);
 		zal->zal_size--;
 
 		metaslab_alloc_trace_t *victim =
 		    list_next(&zal->zal_list, list_head(&zal->zal_list));
 		list_remove(&zal->zal_list, victim);
 		kmem_cache_free(metaslab_alloc_trace_cache, victim);
 	}
 
 	metaslab_alloc_trace_t *entry =
 	    kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
 
 	list_link_init(&entry->mat_list_node);
 	entry->mat_mg = mg;
 	entry->mat_msp = msp;
 	entry->mat_size = psize;
 	entry->mat_dva_id = dva_id;
 	entry->mat_offset = offset;
 	entry->mat_weight = (msp != NULL) ? msp->ms_weight : 0;
 	entry->mat_allocator = allocator;
 
 	/*
 	 * No locking is needed: the list belongs to the zio, and only a
 	 * single thread performs allocations for a given zio.
 	 */
 	list_insert_tail(&zal->zal_list, entry);
 	zal->zal_size++;
 
 	ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
 }
 
 /*
  * Prepare an empty allocation trace list for a zio.
  */
 void
 metaslab_trace_init(zio_alloc_list_t *zal)
 {
 	zal->zal_size = 0;
 	list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
 	    offsetof(metaslab_alloc_trace_t, mat_list_node));
 }
 
 /*
  * Release every trace record on the list, then destroy the list itself.
  */
 void
 metaslab_trace_fini(zio_alloc_list_t *zal)
 {
 	for (;;) {
 		metaslab_alloc_trace_t *entry =
 		    list_remove_head(&zal->zal_list);
 
 		if (entry == NULL)
 			break;
 		kmem_cache_free(metaslab_alloc_trace_cache, entry);
 	}
 
 	list_destroy(&zal->zal_list);
 	zal->zal_size = 0;
 }
 
 /*
  * ==========================================================================
  * Metaslab block operations
  * ==========================================================================
  */
 
 /*
  * Take a reference, tagged with 'tag', on the allocation queue depth of
  * the metaslab group that belongs to top-level vdev 'vdev'.
  *
  * Only allocations flagged METASLAB_ASYNC_ALLOC and not flagged
  * METASLAB_DONT_THROTTLE are counted, and only when the group's class has
  * the allocation throttle enabled.
  */
 static void
 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
     int allocator)
 {
 	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
 	    (flags & METASLAB_DONT_THROTTLE) == 0;
 
 	if (!throttled)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (mg->mg_class->mc_alloc_throttle_enabled) {
 		(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator],
 		    tag);
 	}
 }
 
 /*
  * Raise the group's current maximum queue depth for 'allocator' by one,
  * unless it has already reached mg_max_alloc_queue_depth. The update is a
  * compare-and-swap that is retried with a fresh value whenever it loses
  * to a concurrent update; on success the class's mc_alloc_max_slots for
  * the same allocator is incremented as well.
  */
 static void
 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
 {
 	uint64_t limit = mg->mg_max_alloc_queue_depth;
 
 	for (uint64_t seen = mg->mg_cur_max_alloc_queue_depth[allocator];
 	    seen < limit;
 	    seen = mg->mg_cur_max_alloc_queue_depth[allocator]) {
 		if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
 		    seen, seen + 1) != seen)
 			continue;	/* lost the race; re-read and retry */
 
 		atomic_inc_64(&mg->mg_class->mc_alloc_max_slots[allocator]);
 		return;
 	}
 }
 
 /*
  * Drop the queue depth reference, tagged with 'tag', on the metaslab
  * group that belongs to top-level vdev 'vdev'. The same flag and throttle
  * conditions as in metaslab_group_alloc_increment() decide whether there
  * is anything to do. When 'io_complete' is set, the group's maximum queue
  * depth for this allocator is also given a chance to grow.
  */
 void
 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
     int allocator, boolean_t io_complete)
 {
 	boolean_t throttled = (flags & METASLAB_ASYNC_ALLOC) != 0 &&
 	    (flags & METASLAB_DONT_THROTTLE) == 0;
 
 	if (!throttled)
 		return;
 
 	metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
 
 	if (!mg->mg_class->mc_alloc_throttle_enabled)
 		return;
 
 	(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
 
 	if (io_complete) {
 		metaslab_group_increment_qdepth(mg, allocator);
 	}
 }
 
 /*
  * Debug-only check: for every DVA in 'bp', verify that 'tag' no longer
  * holds a queue depth reference on the owning metaslab group for this
  * allocator. Compiles to an empty function without ZFS_DEBUG.
  */
 void
 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
     int allocator)
 {
 #ifdef ZFS_DEBUG
 	int ndvas = BP_GET_NDVAS(bp);
 
 	for (int d = 0; d < ndvas; d++) {
 		const dva_t *dva = &bp->blk_dva[d];
 		metaslab_group_t *mg =
 		    vdev_lookup_top(spa, DVA_GET_VDEV(dva))->vdev_mg;
 
 		VERIFY(zfs_refcount_not_held(
 		    &mg->mg_alloc_queue_depth[allocator], tag));
 	}
 #endif
 }
 
 /*
  * Try to carve 'size' bytes out of the metaslab using the class's
  * allocator (msop_alloc).
  *
  * On success the range is moved from ms_allocatable to this txg's
  * ms_allocating tree, the metaslab is dirtied if that tree was empty, and
  * the offset is returned. On failure -1ULL is returned. In both cases
  * ms_max_size is refreshed afterwards.
  */
 static uint64_t
 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
 {
 	range_tree_t *rt = msp->ms_allocatable;
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 	uint64_t start;
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(msp->ms_initializing);
 
 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
 		vdev_t *vd = msp->ms_group->mg_vd;
 		range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
 
 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
 		range_tree_remove(rt, start, size);
 
 		/* The first allocation in this txg dirties the metaslab. */
 		if (range_tree_is_empty(alloctree))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		range_tree_add(alloctree, start, size);
 
 		/* Remember the txg of the last successful allocation. */
 		msp->ms_alloc_txg = txg;
 		metaslab_verify_space(msp, txg);
 	}
 
 	/*
 	 * Whether or not the attempt succeeded, the largest allocatable
 	 * block may have changed, so recompute it.
 	 */
 	msp->ms_max_size = metaslab_block_maxsize(msp);
 
 	return (start);
 }
 
 /*
  * Find the metaslab with the highest weight that is less than what we've
  * already tried, resuming from the position recorded in 'search'. In the
  * common case each metaslab is therefore examined at most once.
  *
  * Concurrent callers can reorder metaslabs by activation/passivation once
  * mg_lock has been dropped. If another thread activates a metaslab and we
  * then fail to allocate from the one we selected, we may never try the
  * newly-activated metaslab and will activate another one instead. This is
  * not optimal, but generally harmless (a possible exception being when
  * every metaslab is completely full except for the newly-activated
  * metaslab which we fail to examine).
  *
  * Returns the chosen metaslab, or NULL if none qualifies. On success
  * 'search' is advanced just past the returned metaslab, and '*was_active'
  * tells whether that metaslab was already owned by an allocator.
  */
 static metaslab_t *
 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
     dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
     zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
 {
 	avl_tree_t *tree = &mg->mg_metaslab_tree;
 	avl_index_t where;
 	metaslab_t *msp;
 
 	msp = avl_find(tree, search, &where);
 	if (msp == NULL)
 		msp = avl_nearest(tree, where, AVL_AFTER);
 
 	for (; msp != NULL; msp = AVL_NEXT(tree, msp)) {
 		if (!metaslab_should_allocate(msp, asize)) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			continue;
 		}
 
 		/* Metaslabs being condensed or initialized are skipped. */
 		if (msp->ms_condensing || msp->ms_initializing > 0)
 			continue;
 
 		*was_active = (msp->ms_allocator != -1) ? B_TRUE : B_FALSE;
 
 		/*
 		 * Activating as primary means this is our first allocation
 		 * from this disk, so distance to other DVAs is irrelevant.
 		 * If the candidate was already active, we are desperate
 		 * enough to steal another allocator's metaslab, and again
 		 * do not care about distances.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
 			break;
 
 		/*
 		 * Otherwise, when uniqueness is requested, reject a
 		 * metaslab that already holds one of the earlier DVAs.
 		 */
 		boolean_t collides = B_FALSE;
 		for (int i = 0; want_unique && i < d; i++) {
 			if (!metaslab_is_unique(msp, &dva[i])) {
 				collides = B_TRUE;
 				break;	/* try another metaslab */
 			}
 		}
 		if (!collides)
 			break;
 	}
 
 	if (msp == NULL)
 		return (NULL);
 
 	/* Record where to resume if the caller has to search again. */
 	search->ms_weight = msp->ms_weight;
 	search->ms_start = msp->ms_start + 1;
 	search->ms_allocator = msp->ms_allocator;
 	search->ms_primary = msp->ms_primary;
 
 	return (msp);
 }
 
 /*
  * Allocate 'asize' bytes from some metaslab of group 'mg'.
  *
  * The activation weight is derived from the DVAs already allocated for
  * this BP (dva[0 .. d-1]): PRIMARY when none of them is on this vdev,
  * SECONDARY when exactly one is, and CLAIM when two or more are.
  *
  * The main loop selects a candidate (the allocator's current primary or
  * secondary metaslab, or else one chosen by find_valid_metaslab()),
  * revalidates it under ms_lock, activates it and attempts the allocation.
  * A candidate that cannot be used is retried or passivated with a
  * corrected weight, and the loop selects again.
  *
  * mg:          metaslab group to allocate from
  * zal:         allocation trace list of the zio
  * asize:       allocation size
  * txg:         transaction group of the allocation
  * want_unique: passed on to find_valid_metaslab() so that metaslabs that
  *              already hold one of the earlier DVAs are avoided
  * dva, d:      DVA array of the BP and the index being allocated
  * allocator:   allocator index; forced to 0 while the group has fewer
  *              than three ready metaslabs per allocator
  *
  * Returns the allocated offset, or -1ULL when no metaslab in the group
  * could satisfy the request.
  */
 /* ARGSUSED */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
     int d, int allocator)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	uint64_t activation_weight;
 
 	/*
 	 * Step the activation weight from PRIMARY to SECONDARY to CLAIM,
 	 * once for each earlier DVA that is already on this vdev.
 	 */
 	activation_weight = METASLAB_WEIGHT_PRIMARY;
 	for (int i = 0; i < d; i++) {
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_SECONDARY;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
 			activation_weight = METASLAB_WEIGHT_CLAIM;
 			break;
 		}
 	}
 
 	/*
 	 * If we don't have enough metaslabs active to fill the entire array, we
 	 * just use the 0th slot.
 	 */
 	if (mg->mg_ms_ready < mg->mg_allocators * 3)
 		allocator = 0;
 
 	ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
 
 	/*
 	 * 'search' is a scratch metaslab_t that only serves as the lookup
 	 * key for find_valid_metaslab(); it is freed on every exit path.
 	 */
 	metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
 	search->ms_weight = UINT64_MAX;
 	search->ms_start = 0;
 	/*
 	 * At the end of the metaslab tree are the already-active metaslabs,
 	 * first the primaries, then the secondaries. When we resume searching
 	 * through the tree, we need to consider ms_allocator and ms_primary so
 	 * we start in the location right after where we left off, and don't
 	 * accidentally loop forever considering the same metaslabs.
 	 */
 	search->ms_allocator = -1;
 	search->ms_primary = B_TRUE;
 	for (;;) {
 		boolean_t was_active = B_FALSE;
 
 		mutex_enter(&mg->mg_lock);
 
 		/*
 		 * Prefer the metaslab that this allocator already has
 		 * active for the requested role; otherwise search the tree.
 		 */
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
 		    mg->mg_primaries[allocator] != NULL) {
 			msp = mg->mg_primaries[allocator];
 			was_active = B_TRUE;
 		} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
 		    mg->mg_secondaries[allocator] != NULL) {
 			msp = mg->mg_secondaries[allocator];
 			was_active = B_TRUE;
 		} else {
 			msp = find_valid_metaslab(mg, activation_weight, dva, d,
 			    want_unique, asize, allocator, zal, search,
 			    &was_active);
 		}
 
 		mutex_exit(&mg->mg_lock);
 		/* No candidate is left in this group: give up. */
 		if (msp == NULL) {
 			kmem_free(search, sizeof (*search));
 			return (-1ULL);
 		}
 
 		mutex_enter(&msp->ms_lock);
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock. We check the
 		 * active status first to see if we need to reselect
 		 * a new metaslab.
 		 */
 		if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * If the metaslab is freshly activated for an allocator that
 		 * isn't the one we're allocating from, or if it's a primary and
 		 * we're seeking a secondary (or vice versa), we go back and
 		 * select a new metaslab.
 		 */
 		if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    (msp->ms_allocator != -1) &&
 		    (msp->ms_allocator != allocator || ((activation_weight ==
 		    METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * The metaslab carries the CLAIM weight but this request is
 		 * not a CLAIM activation: strip the CLAIM bit via
 		 * metaslab_passivate() and select again.
 		 */
 		if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
 		    activation_weight != METASLAB_WEIGHT_CLAIM) {
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_WEIGHT_CLAIM);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/* If activation fails, select another metaslab. */
 		if (metaslab_activate(msp, allocator, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		msp->ms_selected_txg = txg;
 
 		/*
 		 * Now that we have the lock, recheck to see if we should
 		 * continue to use this metaslab for this allocation. The
 		 * metaslab is now loaded so metaslab_should_allocate() can
 		 * accurately determine if the allocation attempt should
 		 * proceed.
 		 */
 		if (!metaslab_should_allocate(msp, asize)) {
 			/* Passivate this metaslab and select a new one. */
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_TOO_SMALL, allocator);
 			goto next;
 		}
 
 		/*
 		 * If this metaslab is currently condensing then pick again as
 		 * we can't manipulate this metaslab until it's committed
 		 * to disk. If this metaslab is being initialized, we shouldn't
 		 * allocate from it since the allocated region might be
 		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		} else if (msp->ms_initializing > 0) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_INITIALIZING, allocator);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
 		/*
 		 * Attempt the allocation and trace its outcome; on failure
 		 * the traced offset is -1ULL.
 		 */
 		offset = metaslab_block_alloc(msp, asize, txg);
 		metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 
 		if (offset != -1ULL) {
 			/* Proactively passivate the metaslab, if needed */
 			metaslab_segment_may_passivate(msp);
 			break;
 		}
 next:
 		ASSERT(msp->ms_loaded);
 
 		/*
 		 * We were unable to allocate from this metaslab so determine
 		 * a new weight for this metaslab. Now that we have loaded
 		 * the metaslab we can provide a better hint to the metaslab
 		 * selector.
 		 *
 		 * For space-based metaslabs, we use the maximum block size.
 		 * This information is only available when the metaslab
 		 * is loaded and is more accurate than the generic free
 		 * space weight that was calculated by metaslab_weight().
 		 * This information allows us to quickly compare the maximum
 		 * available allocation in the metaslab to the allocation
 		 * size being requested.
 		 *
 		 * For segment-based metaslabs, determine the new weight
 		 * based on the highest bucket in the range tree. We
 		 * explicitly use the loaded segment weight (i.e. the range
 		 * tree histogram) since it contains the space that is
 		 * currently available for allocation and is accurate
 		 * even within a sync pass.
 		 */
 		if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
 			uint64_t weight = metaslab_block_maxsize(msp);
 			WEIGHT_SET_SPACEBASED(weight);
 			metaslab_passivate(msp, weight);
 		} else {
 			metaslab_passivate(msp,
 			    metaslab_weight_from_range_tree(msp));
 		}
 
 		/*
 		 * We have just failed an allocation attempt, check
 		 * that metaslab_should_allocate() agrees. Otherwise,
 		 * we may end up in an infinite loop retrying the same
 		 * metaslab.
 		 */
 		ASSERT(!metaslab_should_allocate(msp, asize));
 
 		mutex_exit(&msp->ms_lock);
 	}
 	/*
 	 * The loop is only left through the 'break' after a successful
 	 * allocation, so ms_lock is still held at this point.
 	 */
 	mutex_exit(&msp->ms_lock);
 	kmem_free(search, sizeof (*search));
 	return (offset);
 }
 
 /*
  * Allocate 'asize' bytes from metaslab group 'mg' and keep the group's
  * allocation statistics up to date.
  *
  * Returns the allocated offset, or -1ULL on failure. A failure is counted
  * and traced; a failure to allocate even SPA_GANGBLOCKSIZE additionally
  * flags the group as having no free space.
  */
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
     uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva,
     int d, int allocator)
 {
 	ASSERT(mg->mg_initialized);
 
 	uint64_t offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
 	    want_unique, dva, d, allocator);
 	boolean_t failed = (offset == -1ULL);
 
 	mutex_enter(&mg->mg_lock);
 
 	if (failed) {
 		mg->mg_failed_allocations++;
 		metaslab_trace_add(zal, mg, NULL, asize, d,
 		    TRACE_GROUP_FAILURE, allocator);
 	}
 
 	/*
 	 * The group could not even allocate the minimum gang block size,
 	 * so it must be out of space. Tell the allocation throttle to
 	 * start skipping this group until more space becomes available.
 	 * Note: the throttle itself cannot have caused this failure; it
 	 * only skips devices and never fails block allocations.
 	 */
 	if (failed && asize == SPA_GANGBLOCKSIZE)
 		mg->mg_no_free_space = B_TRUE;
 
 	mg->mg_allocations++;
 
 	mutex_exit(&mg->mg_lock);
 
 	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  *
  * spa:       pool to allocate from
  * mc:        metaslab class to allocate from
  * psize:     physical size of the block; converted to an allocated size
  *            per vdev with vdev_psize_to_asize()
  * dva, d:    DVA array of the BP and the index to fill in; dva[d] must
  *            not be valid on entry
  * hintdva:   optional DVA array used to choose the starting group
  * txg:       transaction group of the allocation
  * flags:     METASLAB_* flags
  * zal:       allocation trace list of the zio
  * allocator: allocator index
  *
  * The metaslab groups of the class are walked starting from a group
  * chosen from the hint, the previous DVA, or the class rotor. If a full
  * walk fails, a second walk is made with 'try_hard' set, which relaxes
  * the group eligibility, failing-vdev and unique-metaslab restrictions.
  *
  * Returns 0 with dva[d] filled in on success. Returns ENOSPC when ganging
  * is forced for testing, or when both walks fail; in the latter case
  * dva[d] is zeroed.
  */
 int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
     zio_alloc_list_t *zal, int allocator)
 {
 	metaslab_group_t *mg, *rotor;
 	vdev_t *vd;
 	boolean_t try_hard = B_FALSE;
 
 	ASSERT(!DVA_IS_VALID(&dva[d]));
 
 	/*
 	 * For testing, make some blocks above a certain size be gang blocks.
 	 * This will also test spilling from special to normal.
 	 */
 	if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
 		metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
 		    allocator);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	/*
 	 * Start at the rotor and loop through all mgs until we find something.
 	 * Note that there's no locking on mc_rotor or mc_aliquot because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
 	 * If we are doing ditto or log blocks, try to spread them across
 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
 	 * allocated all of our ditto blocks, then try and spread them out on
 	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
 	 * able to reason about.  Otherwise, any two top-level vdev failures
 	 * will guarantee the loss of data.  With consecutive allocation,
 	 * only two adjacent top-level vdev failures will result in data loss.
 	 *
 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
 	 * ourselves on the same vdev as our gang block header.  That
 	 * way, we can hope for locality in vdev_cache, plus it makes our
 	 * fault domains something tractable.
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
 
 		/*
 		 * It's possible the vdev we're using as the hint no
 		 * longer exists or its mg has been closed (e.g. by
 		 * device removal).  Consult the rotor when
 		 * all else fails.
 		 */
 		if (vd != NULL && vd->vdev_mg != NULL) {
 			mg = vd->vdev_mg;
 
 			if (flags & METASLAB_HINTBP_AVOID &&
 			    mg->mg_next != NULL)
 				mg = mg->mg_next;
 		} else {
 			mg = mc->mc_rotor;
 		}
 	} else if (d != 0) {
 		/*
 		 * For a later DVA, start at the group that follows the one
 		 * holding the previous DVA.
 		 */
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
 	} else {
 		ASSERT(mc->mc_rotor != NULL);
 		mg = mc->mc_rotor;
 	}
 
 	/*
 	 * If the hint put us into the wrong metaslab class, or into a
 	 * metaslab group that has been passivated, just follow the rotor.
 	 */
 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
 		mg = mc->mc_rotor;
 
 	/* Remember where the walk started so a full circle can be detected. */
 	rotor = mg;
 top:
 	do {
 		boolean_t allocatable;
 
 		ASSERT(mg->mg_activation_count == 1);
 		vd = mg->mg_vd;
 
 		/*
 		 * Don't allocate from faulted devices.
 		 */
 		if (try_hard) {
 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
 			allocatable = vdev_allocatable(vd);
 			spa_config_exit(spa, SCL_ZIO, FTAG);
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
 
 		/*
 		 * Determine if the selected metaslab group is eligible
 		 * for allocations. If we're ganging then don't allow
 		 * this metaslab group to skip allocations since that would
 		 * inadvertently return ENOSPC and suspend the pool
 		 * even though space is still available.
 		 */
 		if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
 			allocatable = metaslab_group_allocatable(mg, rotor,
 			    psize, allocator);
 		}
 
 		if (!allocatable) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_NOT_ALLOCATABLE, allocator);
 			goto next;
 		}
 
 		ASSERT(mg->mg_initialized);
 
 		/*
 		 * Avoid writing single-copy data to a failing,
 		 * non-redundant vdev, unless we've already tried all
 		 * other vdevs.
 		 */
 		if ((vd->vdev_stat.vs_write_errors > 0 ||
 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
 		    d == 0 && !try_hard && vd->vdev_children == 0) {
 			metaslab_trace_add(zal, mg, NULL, psize, d,
 			    TRACE_VDEV_ERROR, allocator);
 			goto next;
 		}
 
 		ASSERT(mg->mg_class == mc);
 
 		uint64_t asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
 		/*
 		 * If we don't need to try hard, then require that the
 		 * block be on a different metaslab from any other DVAs
 		 * in this BP (unique=true).  If we are trying hard, then
 		 * allow any metaslab to be used (unique=false).
 		 */
 		uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
 		    !try_hard, dva, d, allocator);
 
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
 			 * over- or under-used relative to the pool,
 			 * and set an allocation bias to even it out.
 			 */
 			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
 				/*
 				 * Percent used of the vdev (vu) and of the
 				 * class (cu); the +1 avoids dividing by zero.
 				 */
 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 				/*
 				 * Calculate how much more or less we should
 				 * try to allocate from this device during
 				 * this iteration around the rotor.
 				 * For example, if a device is 80% full
 				 * and the pool is 20% full then we should
 				 * reduce allocations by 60% on this device.
 				 *
 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
 				 *
 				 * This reduces allocations by 307K for this
 				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
 				    (int64_t)mg->mg_aliquot) / 100;
 			} else if (!metaslab_bias_enabled) {
 				mg->mg_bias = 0;
 			}
 
 			/*
 			 * Once the (biased) aliquot for this group has been
 			 * consumed, move the rotor to the next group.
 			 */
 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
 			    mg->mg_aliquot + mg->mg_bias) {
 				mc->mc_rotor = mg->mg_next;
 				mc->mc_aliquot = 0;
 			}
 
 			/* Publish the result in the caller's DVA. */
 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
 			DVA_SET_OFFSET(&dva[d], offset);
 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
 			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
 next:
 		/*
 		 * This group was skipped or failed: advance the class rotor
 		 * past it and reset the aliquot.
 		 */
 		mc->mc_rotor = mg->mg_next;
 		mc->mc_aliquot = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
 	/*
 	 * If we haven't tried hard, do so now.
 	 */
 	if (!try_hard) {
 		try_hard = B_TRUE;
 		goto top;
 	}
 
 	/* Both walks failed: hand back an empty DVA and report ENOSPC. */
 	bzero(&dva[d], sizeof (dva_t));
 
 	metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
 	return (SET_ERROR(ENOSPC));
 }
 
 /*
  * Queue the range [offset, offset + asize) on concrete vdev 'vd' to be
  * freed.  The range is added to the owning metaslab's ms_checkpointing
  * tree when 'checkpoint' is set and to its ms_freeing tree otherwise.
  * The metaslab is dirtied for the currently syncing txg if both of those
  * trees were empty on entry.
  */
 void
 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
     boolean_t checkpoint)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 
 	/* The metaslab index is the offset shifted down by vdev_ms_shift. */
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	/*
 	 * The range must lie entirely within this metaslab and both its
 	 * start and length must be multiples of 1 << vdev_ashift.
 	 */
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
 
 	metaslab_check_free_impl(vd, offset, asize);
 
 	mutex_enter(&msp->ms_lock);
 	/*
 	 * Dirty the metaslab only while both trees are still empty; the
 	 * check is made before the range below is inserted.
 	 */
 	if (range_tree_is_empty(msp->ms_freeing) &&
 	    range_tree_is_empty(msp->ms_checkpointing)) {
 		vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
 	}
 
 	if (checkpoint) {
 		ASSERT(spa_has_checkpoint(spa));
 		range_tree_add(msp->ms_checkpointing, offset, asize);
 	} else {
 		range_tree_add(msp->ms_freeing, offset, asize);
 	}
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Remap callback handed to vdev_op_remap() by metaslab_free_impl().
  * 'arg' points at the caller's boolean_t checkpoint flag.  A segment that
  * lands on a vdev which itself has a remap op is only marked obsolete;
  * any other segment is freed through metaslab_free_impl().
  */
 /* ARGSUSED */
 void
 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	boolean_t *checkpoint = arg;
 
 	ASSERT3P(checkpoint, !=, NULL);
 
 	if (vd->vdev_ops->vdev_op_remap == NULL) {
 		metaslab_free_impl(vd, offset, size, *checkpoint);
 		return;
 	}
 
 	vdev_indirect_mark_obsolete(vd, offset, size);
 }
 
 /*
  * Free [offset, offset + size) on 'vd', choosing the path by vdev kind:
  * a concrete vdev that is currently being removed, a vdev with a remap
  * op, or an ordinary concrete vdev.  Nothing is done once the syncing
  * txg is past the freeze txg.
  */
 static void
 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
     boolean_t checkpoint)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
 		return;
 
 	/*
 	 * Note: we check if the vdev is concrete because when
 	 * we complete the removal, we first change the vdev to be
 	 * an indirect vdev (in open context), and then (in syncing
 	 * context) clear spa_vdev_removal.
 	 */
 	if (spa->spa_vdev_removal != NULL &&
 	    spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
 	    vdev_is_concrete(vd)) {
 		free_from_removing_vdev(vd, offset, size);
 		return;
 	}
 
 	/* No remap op: free directly from the metaslab. */
 	if (vd->vdev_ops->vdev_op_remap == NULL) {
 		metaslab_free_concrete(vd, offset, size, checkpoint);
 		return;
 	}
 
 	/* Mark the range obsolete here, then free what it maps to. */
 	vdev_indirect_mark_obsolete(vd, offset, size);
 	vd->vdev_ops->vdev_op_remap(vd, offset, size,
 	    metaslab_free_impl_cb, &checkpoint);
 }
 
 /*
  * State shared between spa_remap_blkptr() and remap_blkptr_cb():
  *
  *	rbca_bp			block pointer whose DVA[0] is being remapped
  *	rbca_cb			caller's callback; may be NULL
  *	rbca_remap_vd		vdev reported to rbca_cb on the next call
  *	rbca_remap_offset	offset reported to rbca_cb on the next call
  *	rbca_cb_arg		opaque argument passed through to rbca_cb
  */
 typedef struct remap_blkptr_cb_arg {
 	blkptr_t *rbca_bp;
 	spa_remap_cb_t rbca_cb;
 	vdev_t *rbca_remap_vd;
 	uint64_t rbca_remap_offset;
 	void *rbca_cb_arg;
 } remap_blkptr_cb_arg_t;
 
 /*
  * vdev_op_remap() callback used by spa_remap_blkptr(); 'arg' is a
  * remap_blkptr_cb_arg_t.  If 'size' does not equal the asize of DVA[0]
  * the block is split and nothing is changed.  Otherwise the caller's
  * callback (if any) is invoked on the previously recorded vdev/offset,
  * blk_phys_birth is recomputed from the old vdev's indirect births, and
  * DVA[0] is rewritten to refer to (vd, offset).
  */
 void
 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	remap_blkptr_cb_arg_t *rbca = arg;
 	blkptr_t *bp = rbca->rbca_bp;
 
 	/* We can not remap split blocks. */
 	if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
 		return;
 	ASSERT0(inner_offset);
 
 	if (rbca->rbca_cb != NULL) {
 		/*
 		 * At this point we know that we are not handling split
 		 * blocks and we invoke the callback on the previous
 		 * vdev which must be indirect.
 		 */
 		ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
 
 		rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
 		    rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
 
 		/* set up remap_blkptr_cb_arg for the next call */
 		rbca->rbca_remap_vd = vd;
 		rbca->rbca_remap_offset = offset;
 	}
 
 	/*
 	 * The phys birth time is that of dva[0].  This ensures that we know
 	 * when each dva was written, so that resilver can determine which
 	 * blocks need to be scrubbed (i.e. those written during the time
 	 * the vdev was offline).  It also ensures that the key used in
 	 * the ARC hash table is unique (i.e. dva[0] + phys_birth).  If
 	 * we didn't change the phys_birth, a lookup in the ARC for a
 	 * remapped BP could find the data that was previously stored at
 	 * this vdev + offset.
 	 */
 	/* The births lookup below must use DVA[0] before it is rewritten. */
 	vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
 	    DVA_GET_VDEV(&bp->blk_dva[0]));
 	vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
 	bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
 	    DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
 
 	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
 	DVA_SET_OFFSET(&bp->blk_dva[0], offset);
 }
 
 /*
  * If the block pointer contains any indirect DVAs, modify them to refer to
  * concrete DVAs.  Note that this will sometimes not be possible, leaving
  * the indirect DVA in place.  This happens if the indirect DVA spans multiple
  * segments in the mapping (i.e. it is a "split block").
  *
  * If the BP was remapped, calls the callback on the original dva (note the
  * callback can be called multiple times if the original indirect DVA refers
  * to another indirect DVA, etc).  'callback' may be NULL, in which case
  * no callback is made; 'arg' is passed through to it unchanged.
  *
  * Returns TRUE if the BP was remapped.
  */
 boolean_t
 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
 {
 	remap_blkptr_cb_arg_t rbca;
 
 	if (!zfs_remap_blkptr_enable)
 		return (B_FALSE);
 
 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
 		return (B_FALSE);
 
 	/*
 	 * Dedup BP's can not be remapped, because ddt_phys_select() depends
 	 * on DVA[0] being the same in the BP as in the DDT (dedup table).
 	 */
 	if (BP_GET_DEDUP(bp))
 		return (B_FALSE);
 
 	/*
 	 * Gang blocks can not be remapped, because
 	 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
 	 * the BP used to read the gang block header (GBH) being the same
 	 * as the DVA[0] that we allocated for the GBH.
 	 */
 	if (BP_IS_GANG(bp))
 		return (B_FALSE);
 
 	/*
 	 * Embedded BP's have no DVA to remap.
 	 */
 	if (BP_GET_NDVAS(bp) < 1)
 		return (B_FALSE);
 
 	/*
 	 * Note: we only remap dva[0].  If we remapped other dvas, we
 	 * would no longer know what their phys birth txg is.
 	 */
 	dva_t *dva = &bp->blk_dva[0];
 
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 
 	/* Only vdevs that provide a remap op can be remapped. */
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (B_FALSE);
 
 	rbca.rbca_bp = bp;
 	rbca.rbca_cb = callback;
 	rbca.rbca_remap_vd = vd;
 	rbca.rbca_remap_offset = offset;
 	rbca.rbca_cb_arg = arg;
 
 	/*
 	 * remap_blkptr_cb() will be called in order for each level of
 	 * indirection, until a concrete vdev is reached or a split block is
 	 * encountered. rbca_remap_vd and rbca_remap_offset are updated
 	 * within the callback (when a callback was supplied) as we go from
 	 * the one indirect vdev to the next one (either concrete or indirect
 	 * again) in that order.
 	 */
 	vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
 
 	/* Check if the DVA wasn't remapped because it is a split block */
 	if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Undo the allocation of a DVA which happened in the given transaction group.
  *
  * The DVA's range is removed from the metaslab's ms_allocating tree for
  * 'txg' and added back to ms_allocatable.  Nothing is done when 'txg' is
  * past the freeze txg.  A DVA whose vdev cannot be looked up, or whose
  * offset lies beyond the vdev's metaslabs, is reported with a warning and
  * otherwise ignored.  For gang DVAs the size used is the asize of a gang
  * block header rather than the asize recorded in the DVA.
  */
 void
 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	metaslab_t *msp;
 	vdev_t *vd;
 	uint64_t vdev = DVA_GET_VDEV(dva);
 	uint64_t offset = DVA_GET_OFFSET(dva);
 	uint64_t size = DVA_GET_ASIZE(dva);
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	if (txg > spa_freeze_txg(spa))
 		return;
 
 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
 		/* Name this function so the warning points at the caller. */
 		cmn_err(CE_WARN, "metaslab_unalloc_dva(): bad DVA %llu:%llu",
 		    (u_longlong_t)vdev, (u_longlong_t)offset);
 		ASSERT(0);
 		return;
 	}
 
 	ASSERT(!vd->vdev_removing);
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
 
 	if (DVA_GET_GANG(dva))
 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
 	    offset, size);
 
 	/*
 	 * Sanity-check the range against the metaslab before handing the
 	 * space back to the allocatable tree.
 	 */
 	VERIFY(!msp->ms_condensing);
 	VERIFY3U(offset, >=, msp->ms_start);
 	VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
 	VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
 	    msp->ms_size);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	range_tree_add(msp->ms_allocatable, offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Free the block represented by the given DVA.  Gang DVAs are freed
  * using the asize of a gang block header instead of the asize stored in
  * the DVA; 'checkpoint' is passed through to metaslab_free_impl().
  */
 void
 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
 {
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 	uint64_t asize;
 
 	ASSERT(DVA_IS_VALID(dva));
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	asize = DVA_GET_GANG(dva) ?
 	    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE) : DVA_GET_ASIZE(dva);
 
 	metaslab_free_impl(vd, DVA_GET_OFFSET(dva), asize, checkpoint);
 }
 
 /*
  * Reserve some allocation slots. The reservation system must be called
  * before we call into the allocator. If there aren't any available slots
  * then the I/O will be throttled until an I/O completes and its slots are
  * freed up. The function returns true if it was successful in placing
  * the reservation.
  *
  * 'slots' references are taken on mc_alloc_slots[allocator] on behalf of
  * 'zio'.  The reservation is granted even when fewer than 'slots' are
  * available if GANG_ALLOCATION(flags) is true or METASLAB_MUST_RESERVE
  * is set in 'flags'.  On success ZIO_FLAG_IO_ALLOCATING is set on 'zio'.
  */
 boolean_t
 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
     zio_t *zio, int flags)
 {
 	uint64_t available_slots = 0;
 	boolean_t slot_reserved = B_FALSE;
 	uint64_t max = mc->mc_alloc_max_slots[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	mutex_enter(&mc->mc_lock);
 
 	/* available_slots stays 0 when the count already meets or exceeds max */
 	uint64_t reserved_slots =
 	    zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
 	if (reserved_slots < max)
 		available_slots = max - reserved_slots;
 
 	/*
 	 * NOTE(review): 'slots' is an int compared against a uint64_t, so a
 	 * negative value would compare as a very large unsigned number.
 	 */
 	if (slots <= available_slots || GANG_ALLOCATION(flags) ||
 	    flags & METASLAB_MUST_RESERVE) {
 		/*
 		 * We reserve the slots individually so that we can unreserve
 		 * them individually when an I/O completes.
 		 */
 		for (int d = 0; d < slots; d++) {
 			reserved_slots =
 			    zfs_refcount_add(&mc->mc_alloc_slots[allocator],
 			    zio);
 		}
 		zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 		slot_reserved = B_TRUE;
 	}
 
 	mutex_exit(&mc->mc_lock);
 	return (slot_reserved);
 }
 
 /*
  * Drop 'slots' references held by 'zio' on mc_alloc_slots[allocator],
  * one at a time, under mc_lock.
  */
 void
 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
     int allocator, zio_t *zio)
 {
 	int released = 0;
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
 	mutex_enter(&mc->mc_lock);
 	while (released < slots) {
 		(void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator],
 		    zio);
 		released++;
 	}
 	mutex_exit(&mc->mc_lock);
 }
 
 /*
  * Claim [offset, offset + size) on 'vd' for transaction group 'txg'.
  *
  * Returns ENXIO if the offset is beyond the vdev's metaslabs, ENOENT if
  * the range is not entirely present in the metaslab's allocatable tree,
  * or any error (other than EBUSY) from metaslab_activate().  A 'txg' of
  * 0 requests a dry run: the range is checked but nothing is modified.
  * Otherwise the range is removed from ms_allocatable and, if the pool is
  * writeable, added to the ms_allocating tree for 'txg'.
  */
 static int
 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     uint64_t txg)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 	int error = 0;
 
 	/* Use SET_ERROR() here like every other error return in this file. */
 	if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT3P(vd->vdev_ms, !=, NULL);
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
 		error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
 	/*
 	 * No need to fail in that case; someone else has activated the
 	 * metaslab, but that doesn't preclude us from using it.
 	 */
 	if (error == EBUSY)
 		error = 0;
 
 	if (error == 0 &&
 	    !range_tree_contains(msp->ms_allocatable, offset, size))
 		error = SET_ERROR(ENOENT);
 
 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
 		mutex_exit(&msp->ms_lock);
 		return (error);
 	}
 
 	VERIFY(!msp->ms_condensing);
 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
 	VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
 	    msp->ms_size);
 	range_tree_remove(msp->ms_allocatable, offset, size);
 
 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
 		/* Dirty the metaslab on the first allocation for this txg. */
 		if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
 		range_tree_add(msp->ms_allocating[txg & TXG_MASK],
 		    offset, size);
 	}
 
 	mutex_exit(&msp->ms_lock);
 
 	return (0);
 }
 
 /*
  * Argument passed to metaslab_claim_impl_cb():
  *
  *	mcca_txg	txg forwarded to metaslab_claim_concrete()
  *	mcca_error	result of the last claim attempted; once nonzero,
  *			the callback makes no further claims
  */
 typedef struct metaslab_claim_cb_arg_t {
 	uint64_t	mcca_txg;
 	int		mcca_error;
 } metaslab_claim_cb_arg_t;
 
 /*
  * Remap callback used by metaslab_claim_impl().  Claims the mapped
  * segment unless an earlier segment already recorded an error in the
  * metaslab_claim_cb_arg_t that 'arg' points to.
  */
 /* ARGSUSED */
 static void
 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	metaslab_claim_cb_arg_t *mcca_arg = arg;
 
 	/* A previous segment failed; keep that error. */
 	if (mcca_arg->mcca_error != 0)
 		return;
 
 	mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, size,
 	    mcca_arg->mcca_txg);
 }
 
 /*
  * Claim [offset, offset + size) on 'vd' for 'txg'.  A vdev without a
  * remap op is claimed directly.  For a vdev with a remap op, every
  * segment the range maps to is claimed first and, if that succeeds, the
  * range is then claimed on 'vd' itself.  Returns 0 or the first error.
  */
 int
 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
 {
 	metaslab_claim_cb_arg_t arg;
 
 	if (vd->vdev_ops->vdev_op_remap == NULL)
 		return (metaslab_claim_concrete(vd, offset, size, txg));
 
 	/*
 	 * Only zdb(1M) can claim on indirect vdevs.  This is used
 	 * to detect leaks of mapped space (that are not accounted
 	 * for in the obsolete counts, spacemap, or bpobj).
 	 */
 	ASSERT(!spa_writeable(vd->vdev_spa));
 	arg.mcca_error = 0;
 	arg.mcca_txg = txg;
 
 	vd->vdev_ops->vdev_op_remap(vd, offset, size,
 	    metaslab_claim_impl_cb, &arg);
 
 	if (arg.mcca_error != 0)
 		return (arg.mcca_error);
 
 	return (metaslab_claim_concrete(vd, offset, size, txg));
 }
 
 /*
  * Intent log support: upon opening the pool after a crash, notify the SPA
  * of blocks that the intent log has allocated for immediate write, but
  * which are still considered free by the SPA because the last transaction
  * group didn't commit yet.
  *
  * Returns ENXIO when the DVA's top-level vdev cannot be found, otherwise
  * the result of metaslab_claim_impl().
  */
 static int
 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 {
 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
 	uint64_t asize = DVA_GET_ASIZE(dva);
 
 	if (vd == NULL)
 		return (SET_ERROR(ENXIO));
 
 	ASSERT(DVA_IS_VALID(dva));
 
 	/* Gang DVAs are claimed at the size of a gang block header. */
 	if (DVA_GET_GANG(dva))
 		asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 	return (metaslab_claim_impl(vd, DVA_GET_OFFSET(dva), asize, txg));
 }
 
 /*
  * Allocate 'ndvas' DVAs of 'psize' bytes each from class 'mc' into 'bp'.
  *
  * Returns ENOSPC when the class has no rotor (no vdevs), or the error
  * from metaslab_alloc_dva() if any DVA cannot be allocated; in the latter
  * case every DVA allocated so far is unallocated and zeroed before
  * returning.  On success the birth txg of 'bp' is set to 'txg' and 0 is
  * returned.  SCL_ALLOC is held as reader for the duration.
  */
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
     zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
 	int error = 0;
 
 	ASSERT(bp->blk_birth == 0);
 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
 	ASSERT(BP_GET_NDVAS(bp) == 0);
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 	ASSERT3P(zal, !=, NULL);
 
 	for (int d = 0; d < ndvas; d++) {
 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
 		    txg, flags, zal, allocator);
 		if (error != 0) {
 			/*
 			 * Roll back DVAs d-1 .. 0.  The outer loop index is
 			 * reused because we return right after the unwind.
 			 */
 			for (d--; d >= 0; d--) {
 				metaslab_unalloc_dva(spa, &dva[d], txg);
 				metaslab_group_alloc_decrement(spa,
 				    DVA_GET_VDEV(&dva[d]), zio, flags,
 				    allocator, B_FALSE);
 				bzero(&dva[d], sizeof (dva_t));
 			}
 			spa_config_exit(spa, SCL_ALLOC, FTAG);
 			return (error);
 		} else {
 			/*
 			 * Update the metaslab group's queue depth
 			 * based on the newly allocated dva.
 			 */
 			metaslab_group_alloc_increment(spa,
 			    DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
 		}
 
 	}
 	ASSERT(error == 0);
 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	BP_SET_BIRTH(bp, txg, txg);
 
 	return (0);
 }
 
 /*
  * Free every DVA of 'bp'.  When 'now' is set each DVA's allocation in
  * 'txg' is undone via metaslab_unalloc_dva(); otherwise each DVA is freed
  * via metaslab_free_dva(), and 'txg' must be the currently syncing txg.
  * SCL_FREE is held as reader while the DVAs are processed.
  */
 void
 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
 
 	/*
 	 * If we have a checkpoint for the pool we need to make sure that
 	 * the blocks that we free that are part of the checkpoint won't be
 	 * reused until the checkpoint is discarded or we revert to it.
 	 *
 	 * The checkpoint flag is passed down the metaslab_free code path
 	 * and is set whenever we want to add a block to the checkpoint's
 	 * accounting. That is, we "checkpoint" blocks that existed at the
 	 * time the checkpoint was created and are therefore referenced by
 	 * the checkpointed uberblock.
 	 *
 	 * Note that, we don't checkpoint any blocks if the current
 	 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
 	 * normally as they will be referenced by the checkpointed uberblock.
 	 */
 	boolean_t checkpoint = B_FALSE;
 	if (bp->blk_birth <= spa->spa_checkpoint_txg &&
 	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
 		/*
 		 * At this point, if the block is part of the checkpoint
 		 * there is no way it was created in the current txg.
 		 */
 		ASSERT(!now);
 		ASSERT3U(spa_syncing_txg(spa), ==, txg);
 		checkpoint = B_TRUE;
 	}
 
 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
 
 	/* 'checkpoint' is consulted only on the deferred-free path. */
 	for (int d = 0; d < ndvas; d++) {
 		if (now) {
 			metaslab_unalloc_dva(spa, &dva[d], txg);
 		} else {
 			ASSERT3U(txg, ==, spa_syncing_txg(spa));
 			metaslab_free_dva(spa, &dva[d], checkpoint);
 		}
 	}
 
 	spa_config_exit(spa, SCL_FREE, FTAG);
 }
 
 /*
  * Claim every DVA of 'bp' for 'txg'.  A 'txg' of 0 performs a dry run
  * that only reports whether the DVAs are claimable.  Returns 0 on
  * success or the first error encountered.
  */
 int
 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
 {
 	const dva_t *dva = bp->blk_dva;
 	int ndvas = BP_GET_NDVAS(bp);
 	int error = 0;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * First do a dry run to make sure all DVAs are claimable,
 	 * so we don't have to unwind from partial failures below.
 	 */
 	if (txg != 0 && (error = metaslab_claim(spa, bp, 0)) != 0)
 		return (error);
 
 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
 
 	/* Stop at the first DVA that fails to claim. */
 	for (int d = 0; d < ndvas && error == 0; d++)
 		error = metaslab_claim_dva(spa, &dva[d], txg);
 
 	spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	ASSERT(error == 0 || txg == 0);
 
 	return (error);
 }
 
 /*
  * Remap callback used by metaslab_check_free_impl(): verify the mapped
  * segment unless it lands on an indirect vdev, which is skipped.
  */
 /* ARGSUSED */
 static void
 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
     uint64_t size, void *arg)
 {
 	if (vd->vdev_ops != &vdev_indirect_ops)
 		metaslab_check_free_impl(vd, offset, size);
 }
 
 /*
  * Debug check, active only when ZFS_DEBUG_ZIO_FREE is set in zfs_flags:
  * verify that [offset, offset + size) on 'vd' is not present in any of
  * the metaslab's allocatable (if loaded), freeing, checkpointing, freed
  * or defer trees.  For a vdev with a remap op the check is applied to
  * the segments the range maps to instead.
  */
 static void
 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
 {
 	metaslab_t *msp;
 	spa_t *spa = vd->vdev_spa;
 
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	if (vd->vdev_ops->vdev_op_remap != NULL) {
 		vd->vdev_ops->vdev_op_remap(vd, offset, size,
 		    metaslab_check_free_impl_cb, NULL);
 		return;
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
 	ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
 
 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
 
 	mutex_enter(&msp->ms_lock);
 	/* ms_allocatable is only consulted when the metaslab is loaded. */
 	if (msp->ms_loaded) {
 		range_tree_verify_not_present(msp->ms_allocatable,
 		    offset, size);
 	}
 
 	range_tree_verify_not_present(msp->ms_freeing, offset, size);
 	range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
 	range_tree_verify_not_present(msp->ms_freed, offset, size);
 	for (int j = 0; j < TXG_DEFER_SIZE; j++)
 		range_tree_verify_not_present(msp->ms_defer[j], offset, size);
 	mutex_exit(&msp->ms_lock);
 }
 
 /*
  * Debug check, active only when ZFS_DEBUG_ZIO_FREE is set in zfs_flags:
  * run metaslab_check_free_impl() on every DVA of 'bp'.  Gang DVAs are
  * checked at the asize of a gang block header.  SCL_VDEV is held as
  * reader while the DVAs are examined.
  */
 void
 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
 {
 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
 		return;
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
 		vdev_t *vd = vdev_lookup_top(spa, vdev);
 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
 
 		/*
 		 * Check the lookup result before vd is handed to anything
 		 * else, so a bad DVA trips the assertion rather than a
 		 * later use of a NULL vdev.
 		 */
 		ASSERT3P(vd, !=, NULL);
 
 		if (DVA_GET_GANG(&bp->blk_dva[i]))
 			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
 
 		metaslab_check_free_impl(vd, offset, size);
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	(revision 354953)
@@ -1,4319 +1,4319 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/space_map.h>
 #include <sys/space_reftree.h>
 #include <sys/zio.h>
 #include <sys/zap.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
 #include <sys/vdev_initialize.h>
 
 /*
  * Virtual device management.
  */
 
 /*
  * NULL-terminated table of the vdev ops vectors that vdev_getops()
  * searches by type name.
  */
 static vdev_ops_t *vdev_ops_table[] = {
 	&vdev_root_ops,
 	&vdev_raidz_ops,
 	&vdev_mirror_ops,
 	&vdev_replacing_ops,
 	&vdev_spare_ops,
 	&vdev_disk_ops,
 	&vdev_file_ops,
 	&vdev_missing_ops,
 	&vdev_hole_ops,
 	&vdev_indirect_ops,
 	NULL
 };
 
 /* maximum scrub/resilver I/O queue per leaf vdev */
 int zfs_scrub_limit = 10;
 
 /* default target for number of metaslabs per top-level vdev */
 int zfs_vdev_default_ms_count = 200;
 
 /* minimum number of metaslabs per top-level vdev */
 int zfs_vdev_min_ms_count = 16;
 
 /* practical upper limit of total metaslabs per top-level vdev */
 /* (1 << 17 == 131072; the ULL constant is narrowed to int here) */
 int zfs_vdev_ms_count_limit = 1ULL << 17;
 
 /* lower limit for metaslab size (512M) */
 int zfs_vdev_default_ms_shift = 29;
 
 /* upper limit for metaslab size (16G) */
 int zfs_vdev_max_ms_shift = 34;
 
 /*
  * NOTE(review): no user of this flag is visible in this part of the
  * file; judging by the name it presumably lets vdev validation be
  * skipped -- confirm against its users.
  */
 boolean_t vdev_validate_skip = B_FALSE;
 
 /*
  * Since the DTL space map of a vdev is not expected to have a lot of
  * entries, we default its block size to 4K.
  */
 int vdev_dtl_sm_blksz = (1 << 12);
 
 /*
  * vdev-wide space maps that have lots of entries written to them at
  * the end of each transaction can benefit from a higher I/O bandwidth
  * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
  */
 int vdev_standard_sm_blksz = (1 << 17);
 
 /*
  * No initializer, so this starts out as 0.
  * NOTE(review): presumably a lower bound applied to vdev ashift values;
  * its users are not visible here -- confirm.
  */
 int zfs_ashift_min;
 
 /*
  * Format a printf-style message into a 256-byte buffer and log it through
  * zfs_dbgmsg(), prefixed with the vdev's type and either its path or,
  * when it has no path, its id and guid.
  */
 /*PRINTFLIKE2*/
 void
 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
 {
 	char msg[256];
 	va_list ap;
 
 	va_start(ap, fmt);
 	(void) vsnprintf(msg, sizeof (msg), fmt, ap);
 	va_end(ap);
 
 	if (vd->vdev_path == NULL) {
 		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
 		    vd->vdev_ops->vdev_op_type,
 		    (u_longlong_t)vd->vdev_id,
 		    (u_longlong_t)vd->vdev_guid, msg);
 		return;
 	}
 
 	zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
 	    vd->vdev_path, msg);
 }
 
 /*
  * Log one zfs_dbgmsg() line describing 'vd', indented by 'indent'
  * columns, then recurse into each child with the indent increased by 2.
  * Hole and missing vdevs are logged with just their id and type and are
  * not descended into.
  */
 void
 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
 {
 	char state[20];
 
 	/*
 	 * vdev_id is printed as %llu with an explicit u_longlong_t cast,
 	 * matching vdev_dbgmsg(), so the argument type always agrees with
 	 * the conversion specifier.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
 		zfs_dbgmsg("%*svdev %llu: %s", indent, "",
 		    (u_longlong_t)vd->vdev_id,
 		    vd->vdev_ops->vdev_op_type);
 		return;
 	}
 
 	switch (vd->vdev_state) {
 	case VDEV_STATE_UNKNOWN:
 		(void) snprintf(state, sizeof (state), "unknown");
 		break;
 	case VDEV_STATE_CLOSED:
 		(void) snprintf(state, sizeof (state), "closed");
 		break;
 	case VDEV_STATE_OFFLINE:
 		(void) snprintf(state, sizeof (state), "offline");
 		break;
 	case VDEV_STATE_REMOVED:
 		(void) snprintf(state, sizeof (state), "removed");
 		break;
 	case VDEV_STATE_CANT_OPEN:
 		(void) snprintf(state, sizeof (state), "can't open");
 		break;
 	case VDEV_STATE_FAULTED:
 		(void) snprintf(state, sizeof (state), "faulted");
 		break;
 	case VDEV_STATE_DEGRADED:
 		(void) snprintf(state, sizeof (state), "degraded");
 		break;
 	case VDEV_STATE_HEALTHY:
 		(void) snprintf(state, sizeof (state), "healthy");
 		break;
 	default:
 		/* Unrecognized state: show the raw numeric value. */
 		(void) snprintf(state, sizeof (state), "<state %u>",
 		    (uint_t)vd->vdev_state);
 	}
 
 	zfs_dbgmsg("%*svdev %llu: %s%s, guid: %llu, path: %s, %s", indent,
 	    "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type,
 	    vd->vdev_islog ? " (log)" : "",
 	    (u_longlong_t)vd->vdev_guid,
 	    vd->vdev_path ? vd->vdev_path : "N/A", state);
 
 	for (uint64_t i = 0; i < vd->vdev_children; i++)
 		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
 }
 
 /*
  * Given a vdev type, return the appropriate ops vector, or NULL when no
  * entry in vdev_ops_table has a matching vdev_op_type.
  */
 static vdev_ops_t *
 vdev_getops(const char *type)
 {
 	vdev_ops_t **slot = vdev_ops_table;
 
 	/* Advance until a matching entry or the NULL terminator. */
 	while (*slot != NULL && strcmp((*slot)->vdev_op_type, type) != 0)
 		slot++;
 
 	return (*slot);
 }
 
 /*
  * Derive the enumerated allocation bias from string input.
  * String origin is either the per-vdev zap or zpool(1M).
  * Any string that is not one of the known bias names yields
  * VDEV_BIAS_NONE.
  */
 static vdev_alloc_bias_t
 vdev_derive_alloc_bias(const char *bias)
 {
 	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
 		return (VDEV_BIAS_LOG);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
 		return (VDEV_BIAS_SPECIAL);
 
 	if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
 		return (VDEV_BIAS_DEDUP);
 
 	return (VDEV_BIAS_NONE);
 }
 
 /*
  * Default range translation: the result segment gets the same start and
  * end as the input segment.  'vd' is not used.
  */
 /* ARGSUSED */
 void
 vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
 {
 	res->rs_end = in->rs_end;
 	res->rs_start = in->rs_start;
 }
 
 /*
  * Default asize function: return the MAX of psize (rounded up to the
  * top-level vdev's ashift) with the asize of all children.  This is
  * what's used by anything other than RAID-Z.
  */
 uint64_t
 vdev_default_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t largest = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		uint64_t child_asize =
 		    vdev_psize_to_asize(vd->vdev_child[c], psize);
 
 		if (largest < child_asize)
 			largest = child_asize;
 	}
 
 	return (largest);
 }
 
 /*
  * Get the minimum allocatable size. We define the allocatable size as
  * the vdev's asize aligned to a metaslab boundary. This allows us to
  * replace or attach devices which don't have the same physical size but
  * can still satisfy the same number of allocations.
  */
 uint64_t
 vdev_get_min_asize(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 
 	/* No parent: just report our own asize. */
 	if (parent == NULL)
 		return (vd->vdev_asize);
 
 	/* A top-level vdev reports its asize aligned to its metaslab size. */
 	if (vd == vd->vdev_top)
 		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
 
 	/* Children of anything but raidz inherit the parent's minimum. */
 	if (parent->vdev_ops != &vdev_raidz_ops)
 		return (parent->vdev_min_asize);
 
 	/*
 	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
 	 * so each child must provide at least 1/Nth of its asize (rounded up).
 	 */
 	return ((parent->vdev_min_asize + parent->vdev_children - 1) /
 	    parent->vdev_children);
 }
 
 /*
  * Recompute vdev_min_asize for 'vd' and then, recursively, for each of
  * its children (parent first, since children may read the parent's value).
  */
 void
 vdev_set_min_asize(vdev_t *vd)
 {
 	int c = 0;
 
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	while (c < vd->vdev_children) {
 		vdev_set_min_asize(vd->vdev_child[c]);
 		c++;
 	}
 }
 
 /*
  * Return the child of the root vdev with index 'vdev', or NULL when the
  * index is not below the root's child count.  The caller must hold some
  * spa config lock.
  */
 vdev_t *
 vdev_lookup_top(spa_t *spa, uint64_t vdev)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	if (vdev >= rvd->vdev_children)
 		return (NULL);
 
 	ASSERT(rvd->vdev_child[vdev] != NULL);
 	return (rvd->vdev_child[vdev]);
 }
 
 /*
  * Depth-first search of the tree rooted at 'vd' for the vdev whose guid
  * equals 'guid'.  Returns the first match, or NULL if there is none.
  */
 vdev_t *
 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
 {
 	if (vd->vdev_guid == guid)
 		return (vd);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *found = vdev_lookup_by_guid(vd->vdev_child[c], guid);
 
 		if (found != NULL)
 			return (found);
 	}
 
 	return (NULL);
 }
 
 /*
  * Count the leaf vdevs in the tree rooted at 'vd': a vdev whose ops mark
  * it as a leaf counts as one, anything else as the sum of its children.
  */
 static int
 vdev_count_leaves_impl(vdev_t *vd)
 {
 	int leaves = 0;
 	int c = 0;
 
 	if (vd->vdev_ops->vdev_op_leaf)
 		return (1);
 
 	while (c < vd->vdev_children) {
 		leaves += vdev_count_leaves_impl(vd->vdev_child[c]);
 		c++;
 	}
 
 	return (leaves);
 }
 
 /*
  * Return the number of leaf vdevs beneath the pool's root vdev.
  */
 int
 vdev_count_leaves(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	return (vdev_count_leaves_impl(root));
 }
 
 /*
  * Attach 'cvd' to parent 'pvd' at index cvd->vdev_id, growing the
  * parent's child array if needed.  Also sets cvd->vdev_top, adds the
  * child's guid sum to every ancestor, and puts leaf vdevs on the spa's
  * leaf list.  With a NULL 'pvd' only cvd->vdev_parent is set (to NULL).
  * The caller must hold all spa config locks as writer.
  */
 void
 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
 {
 	size_t oldsize, newsize;
 	uint64_t id = cvd->vdev_id;
 	vdev_t **newchild;
 	spa_t *spa = cvd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(cvd->vdev_parent == NULL);
 
 	cvd->vdev_parent = pvd;
 
 	if (pvd == NULL)
 		return;
 
 	/* The target slot must be past the end of the array or empty. */
 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
 
 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
 	newsize = pvd->vdev_children * sizeof (vdev_t *);
 
 	/*
 	 * A new zeroed array is always allocated (even when the size is
 	 * unchanged) and the old contents copied over, so any newly added
 	 * slots start out NULL.
 	 */
 	newchild = kmem_zalloc(newsize, KM_SLEEP);
 	if (pvd->vdev_child != NULL) {
 		bcopy(pvd->vdev_child, newchild, oldsize);
 		kmem_free(pvd->vdev_child, oldsize);
 	}
 
 	pvd->vdev_child = newchild;
 	pvd->vdev_child[id] = cvd;
 
 	/* If the parent has no top-level vdev, the child is its own top. */
 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
 		cvd->vdev_spa->spa_leaf_list_gen++;
 	}
 }
 
 /*
  * Detach 'cvd' from parent 'pvd'.  The child's slot is set to NULL; if
  * that leaves the parent with no children the child array is freed.
  * Leaf vdevs are taken off the spa's leaf list, and the child's guid sum
  * is subtracted from every ancestor.  Does nothing when 'pvd' is NULL.
  */
 void
 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
 {
 	int c;
 	/*
 	 * NOTE(review): vdev_add_child() keeps the same field in a
 	 * uint64_t; confirm that uint_t is wide enough here.
 	 */
 	uint_t id = cvd->vdev_id;
 
 	ASSERT(cvd->vdev_parent == pvd);
 
 	if (pvd == NULL)
 		return;
 
 	ASSERT(id < pvd->vdev_children);
 	ASSERT(pvd->vdev_child[id] == cvd);
 
 	pvd->vdev_child[id] = NULL;
 	cvd->vdev_parent = NULL;
 
 	/* Look for any remaining child; c == vdev_children means none. */
 	for (c = 0; c < pvd->vdev_children; c++)
 		if (pvd->vdev_child[c])
 			break;
 
 	if (c == pvd->vdev_children) {
 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
 		pvd->vdev_child = NULL;
 		pvd->vdev_children = 0;
 	}
 
 	if (cvd->vdev_ops->vdev_op_leaf) {
 		spa_t *spa = cvd->vdev_spa;
 		list_remove(&spa->spa_leaf_list, cvd);
 		spa->spa_leaf_list_gen++;
 	}
 
 	/*
 	 * Walk up all ancestors to update guid sum.
 	 */
 	for (; pvd != NULL; pvd = pvd->vdev_parent)
 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
 }
 
 /*
  * Remove any holes in the child array.
  *
  * The non-NULL children are copied, in order, into a new array sized to
  * hold exactly that many entries, and each child's vdev_id is rewritten
  * to its new index.  The caller must hold all spa config locks as writer.
  */
 void
 vdev_compact_children(vdev_t *pvd)
 {
 	vdev_t **newchild, *cvd;
 	int oldc = pvd->vdev_children;
 	int newc;
 
 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* First pass: count the surviving children ("newc = 0" resets it). */
 	for (int c = newc = 0; c < oldc; c++)
 		if (pvd->vdev_child[c])
 			newc++;
 
 	/*
 	 * The array is not zeroed; the second pass below fills every one
 	 * of its newc slots.
 	 */
 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
 
 	/* Second pass: pack the children and renumber them. */
 	for (int c = newc = 0; c < oldc; c++) {
 		if ((cvd = pvd->vdev_child[c]) != NULL) {
 			newchild[newc] = cvd;
 			cvd->vdev_id = newc++;
 		}
 	}
 
 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
 	pvd->vdev_child = newchild;
 	pvd->vdev_children = newc;
 }
 
 /*
  * Allocate and minimally initialize a vdev_t.
  *
  * The vdev is zero-allocated, given the supplied id, guid and ops, and
  * placed in VDEV_STATE_CLOSED.  If the spa has no root vdev yet, the new
  * vdev becomes the root.  A 'guid' of 0 causes a new guid to be generated
  * unless 'ops' is vdev_hole_ops.  Locks, condition variables, range trees
  * and txg lists embedded in the vdev are initialized before returning.
  */
 vdev_t *
 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
 {
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 
 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
 	vic = &vd->vdev_indirect_config;
 
 	/* The first vdev allocated for a spa must be its root vdev. */
 	if (spa->spa_root_vdev == NULL) {
 		ASSERT(ops == &vdev_root_ops);
 		spa->spa_root_vdev = vd;
 		spa->spa_load_guid = spa_generate_guid(NULL);
 	}
 
 	if (guid == 0 && ops != &vdev_hole_ops) {
 		if (spa->spa_root_vdev == vd) {
 			/*
 			 * The root vdev's guid will also be the pool guid,
 			 * which must be unique among all pools.
 			 */
 			guid = spa_generate_guid(NULL);
 		} else {
 			/*
 			 * Any other vdev's guid must be unique within the pool.
 			 */
 			guid = spa_generate_guid(spa);
 		}
 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
 	}
 
 	vd->vdev_spa = spa;
 	vd->vdev_id = id;
 	vd->vdev_guid = guid;
 	/* With no children yet, the guid sum is just this vdev's guid. */
 	vd->vdev_guid_sum = guid;
 	vd->vdev_ops = ops;
 	vd->vdev_state = VDEV_STATE_CLOSED;
 	vd->vdev_ishole = (ops == &vdev_hole_ops);
 	vic->vic_prev_indirect_vdev = UINT64_MAX;
 
 	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
 	vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);
 
 	list_link_init(&vd->vdev_leaf_node);
 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
 
 	/* One range tree per DTL type. */
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
 	}
 	txg_list_create(&vd->vdev_ms_list, spa,
 	    offsetof(struct metaslab, ms_txg_node));
 	txg_list_create(&vd->vdev_dtl_list, spa,
 	    offsetof(struct vdev, vdev_dtl_node));
 	vd->vdev_stat.vs_timestamp = gethrtime();
 	vdev_queue_init(vd);
 	vdev_cache_init(vd);
 
 	return (vd);
 }
 
 /*
  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
  * creating a new vdev or loading an existing one - the behavior is slightly
  * different for each case.
  *
  * The vdev is described by the nvlist 'nv' and is attached to 'parent' at
  * child index 'id'.  On success, 0 is returned and the new vdev is stored
  * in *vdp.  On failure an errno is returned and nothing has been allocated:
  * EINVAL for a missing or unknown type, a missing or mismatched id or guid,
  * a bad RAID-Z parity, or a non-root vdev allocated before the root;
  * ENOTSUP when the pool version or feature set does not support the
  * requested configuration.
  *
  * The caller must hold all SCL_ALL config locks as writer.
  */
 int
 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
     int alloctype)
 {
 	vdev_ops_t *ops;
 	char *type;
 	uint64_t guid = 0, islog, nparity;
 	vdev_t *vd;
 	vdev_indirect_config_t *vic;
 	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
 	/* A top-level vdev is one whose parent has no parent of its own. */
 	boolean_t top_level = (parent && !parent->vdev_parent);
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
 		return (SET_ERROR(EINVAL));
 
 	if ((ops = vdev_getops(type)) == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * If this is a load, get the vdev guid from the nvlist.
 	 * Otherwise, vdev_alloc_common() will generate one for us.
 	 * Spare, l2cache and rootpool allocations also take their guid
 	 * from the nvlist; only a load additionally checks that the id
 	 * recorded in the nvlist matches the requested child index.
 	 */
 	if (alloctype == VDEV_ALLOC_LOAD) {
 		uint64_t label_id;
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
 		    label_id != id)
 			return (SET_ERROR(EINVAL));
 
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_SPARE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
 			return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The first allocated vdev must be of type 'root'.
 	 */
 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Determine whether we're a log vdev.
 	 */
 	islog = 0;
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the nparity property for RAID-Z vdevs.
 	 * -1ULL is a sentinel; every path below must replace it.
 	 */
 	nparity = -1ULL;
 	if (ops == &vdev_raidz_ops) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
 			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Previous versions could only support 1 or 2 parity
 			 * devices.
 			 */
 			if (nparity > 1 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(ENOTSUP));
 			if (nparity > 2 &&
 			    spa_version(spa) < SPA_VERSION_RAIDZ3)
 				return (SET_ERROR(ENOTSUP));
 		} else {
 			/*
 			 * We require the parity to be specified for SPAs that
 			 * support multiple parity levels.
 			 */
 			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
 				return (SET_ERROR(EINVAL));
 			/*
 			 * Otherwise, we default to 1 parity device for RAID-Z.
 			 */
 			nparity = 1;
 		}
 	} else {
 		nparity = 0;
 	}
 	ASSERT(nparity != -1ULL);
 
 	/*
 	 * If creating a top-level vdev, check for allocation classes input
 	 */
 	if (top_level && alloctype == VDEV_ALLOC_ADD) {
 		char *bias;
 
 		if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
 		    &bias) == 0) {
 			alloc_bias = vdev_derive_alloc_bias(bias);
 
 			/* spa_vdev_add() expects feature to be enabled */
 			if (alloc_bias != VDEV_BIAS_LOG &&
 			    spa->spa_load_state != SPA_LOAD_CREATE &&
 			    !spa_feature_is_enabled(spa,
 			    SPA_FEATURE_ALLOCATION_CLASSES)) {
 				return (SET_ERROR(ENOTSUP));
 			}
 		}
 	}
 
 	/*
 	 * All validation that can fail is done; from here on the function
 	 * only populates the new vdev and returns success.
 	 */
 	vd = vdev_alloc_common(spa, id, guid, ops);
 	vic = &vd->vdev_indirect_config;
 
 	vd->vdev_islog = islog;
 	vd->vdev_nparity = nparity;
 	if (top_level && alloc_bias != VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = alloc_bias;
 
 	/*
 	 * Each string found in the nvlist is duplicated with spa_strdup()
 	 * so that the vdev holds its own copy (released in vdev_free()).
 	 */
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
 		vd->vdev_path = spa_strdup(vd->vdev_path);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 	    &vd->vdev_physpath) == 0)
 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
 		vd->vdev_fru = spa_strdup(vd->vdev_fru);
 
 	/*
 	 * Set the whole_disk property.  If it's not specified, leave the value
 	 * as -1.
 	 */
 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 	    &vd->vdev_wholedisk) != 0)
 		vd->vdev_wholedisk = -1ULL;
 
 	/*
 	 * Pick up the indirect-vdev configuration if present.  The asserts
 	 * check the defaults established by vdev_alloc_common().
 	 */
 	ASSERT0(vic->vic_mapping_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
 	    &vic->vic_mapping_object);
 	ASSERT0(vic->vic_births_object);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
 	    &vic->vic_births_object);
 	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
 	    &vic->vic_prev_indirect_vdev);
 
 	/*
 	 * Look for the 'not present' flag.  This will only be set if the device
 	 * was not present at the time of import.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
 	    &vd->vdev_not_present);
 
 	/*
 	 * Get the alignment requirement.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
 
 	/*
 	 * Retrieve the vdev creation time.
 	 */
 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
 	    &vd->vdev_crtxg);
 
 	/*
 	 * If we're a top-level vdev, try to load the allocation parameters.
 	 */
 	if (top_level &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    &vd->vdev_ms_array);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    &vd->vdev_ms_shift);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    &vd->vdev_asize);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
 		    &vd->vdev_removing);
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    &vd->vdev_top_zap);
 	} else {
 		ASSERT0(vd->vdev_top_zap);
 	}
 
 	/*
 	 * Sanity-check the allocation type for top-level vdevs; this block
 	 * has no effect in non-debug builds.
 	 */
 	if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
 		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
 		    alloctype == VDEV_ALLOC_ADD ||
 		    alloctype == VDEV_ALLOC_SPLIT ||
 		    alloctype == VDEV_ALLOC_ROOTPOOL);
 		/* Note: metaslab_group_create() is now deferred */
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
 		(void) nvlist_lookup_uint64(nv,
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
 	} else {
 		ASSERT0(vd->vdev_leaf_zap);
 	}
 
 	/*
 	 * If we're a leaf vdev, try to load the DTL object and other state.
 	 */
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
 		if (alloctype == VDEV_ALLOC_LOAD) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
 			    &vd->vdev_dtl_object);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
 			    &vd->vdev_unspare);
 		}
 
 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
 			uint64_t spare = 0;
 
 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
 			    &spare) == 0 && spare)
 				spa_spare_add(vd);
 		}
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
 		    &vd->vdev_offline);
 
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
 		 * valid in the current context.  Local vdevs will
 		 * remain in the faulted state.
 		 */
 		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
 			    &vd->vdev_faulted);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
 			    &vd->vdev_degraded);
 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
 			    &vd->vdev_removed);
 
 			if (vd->vdev_faulted || vd->vdev_degraded) {
 				char *aux;
 
 				/*
 				 * Default to "errors exceeded" unless the
 				 * config records an external diagnosis.
 				 */
 				vd->vdev_label_aux =
 				    VDEV_AUX_ERR_EXCEEDED;
 				if (nvlist_lookup_string(nv,
 				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
 				    strcmp(aux, "external") == 0)
 					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
 			}
 		}
 	}
 
 	/*
 	 * Add ourselves to the parent's list of children.
 	 */
 	vdev_add_child(parent, vd);
 
 	*vdp = vd;
 
 	return (0);
 }
 
 /*
  * Tear down and free a vdev together with all of its descendants.
  *
  * The vdev is closed, its children are freed recursively, its allocation
  * state is discarded, it is unlinked from its parent, and then every
  * resource set up by vdev_alloc_common()/vdev_alloc() (strings, txg lists,
  * range trees, space maps, locks, condition variables) is released before
  * the structure itself is freed.  If the vdev was the pool's root vdev,
  * spa_root_vdev is cleared.
  *
  * No initialize thread may be attached to the vdev when this is called.
  */
 void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
 	/*
 	 * Scan queues are normally destroyed at the end of a scan. If the
 	 * queue exists here, that implies the vdev is being removed while
 	 * the scan is still running.
 	 */
 	if (vd->vdev_scan_io_queue != NULL) {
 		mutex_enter(&vd->vdev_scan_io_queue_lock);
 		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
 		vd->vdev_scan_io_queue = NULL;
 		mutex_exit(&vd->vdev_scan_io_queue_lock);
 	}
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
 	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 *
 	 * Each child removes itself from this vdev via vdev_remove_child(),
 	 * which frees the child array and zeroes vdev_children once the
 	 * last child is gone; the asserts below rely on that.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_free(vd->vdev_child[c]);
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
 	ASSERT(vd->vdev_initialize_thread == NULL);
 
 	/*
 	 * Discard allocation state.
 	 */
 	if (vd->vdev_mg != NULL) {
 		vdev_metaslab_fini(vd);
 		metaslab_group_destroy(vd->vdev_mg);
 	}
 
 	ASSERT0(vd->vdev_stat.vs_space);
 	ASSERT0(vd->vdev_stat.vs_dspace);
 	ASSERT0(vd->vdev_stat.vs_alloc);
 
 	/*
 	 * Remove this vdev from its parent's child list.
 	 */
 	vdev_remove_child(vd->vdev_parent, vd);
 
 	ASSERT(vd->vdev_parent == NULL);
 	ASSERT(!list_link_active(&vd->vdev_leaf_node));
 
 	/*
 	 * Clean up vdev structure.
 	 */
 	vdev_queue_fini(vd);
 	vdev_cache_fini(vd);
 
 	/* Release the strings duplicated by vdev_alloc(). */
 	if (vd->vdev_path)
 		spa_strfree(vd->vdev_path);
 	if (vd->vdev_devid)
 		spa_strfree(vd->vdev_devid);
 	if (vd->vdev_physpath)
 		spa_strfree(vd->vdev_physpath);
 	if (vd->vdev_fru)
 		spa_strfree(vd->vdev_fru);
 
 	if (vd->vdev_isspare)
 		spa_spare_remove(vd);
 	if (vd->vdev_isl2cache)
 		spa_l2cache_remove(vd);
 
 	txg_list_destroy(&vd->vdev_ms_list);
 	txg_list_destroy(&vd->vdev_dtl_list);
 
 	/* Close the DTL space map and empty/destroy every DTL range tree. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	space_map_close(vd->vdev_dtl_sm);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
 		range_tree_destroy(vd->vdev_dtl[t]);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* The indirect births and mapping objects exist only as a pair. */
 	EQUIV(vd->vdev_indirect_births != NULL,
 	    vd->vdev_indirect_mapping != NULL);
 	if (vd->vdev_indirect_births != NULL) {
 		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
 		vdev_indirect_births_close(vd->vdev_indirect_births);
 	}
 
 	if (vd->vdev_obsolete_sm != NULL) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		space_map_close(vd->vdev_obsolete_sm);
 		vd->vdev_obsolete_sm = NULL;
 	}
 	range_tree_destroy(vd->vdev_obsolete_segments);
 	rw_destroy(&vd->vdev_indirect_rwlock);
 	mutex_destroy(&vd->vdev_obsolete_lock);
 
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
 	mutex_destroy(&vd->vdev_scan_io_queue_lock);
 	mutex_destroy(&vd->vdev_initialize_lock);
 	mutex_destroy(&vd->vdev_initialize_io_lock);
 	cv_destroy(&vd->vdev_initialize_io_cv);
 	cv_destroy(&vd->vdev_initialize_cv);
 
 	/* Let the pool know it no longer has a root vdev. */
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
 
 	kmem_free(vd, sizeof (vdev_t));
 }
 
 /*
  * Transfer top-level vdev state from svd to tvd.
  *
  * tvd must already be a top-level vdev (tvd == tvd->vdev_top).  Each piece
  * of state (metaslab array/group, checkpoint space map, allocation bias,
  * space accounting, indirect/removal state, per-txg lists, dirty-list
  * membership, deflate ratio, log flag, scan I/O queue) is moved to tvd and
  * then cleared on svd, so that afterwards only tvd owns it.
  */
 static void
 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 {
 	spa_t *spa = svd->vdev_spa;
 	metaslab_t *msp;
 	vdev_t *vd;
 	int t;
 
 	ASSERT(tvd == tvd->vdev_top);
 
 	/* Metaslab layout and the top-level ZAP object. */
 	tvd->vdev_ms_array = svd->vdev_ms_array;
 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
 	tvd->vdev_ms_count = svd->vdev_ms_count;
 	tvd->vdev_top_zap = svd->vdev_top_zap;
 
 	svd->vdev_ms_array = 0;
 	svd->vdev_ms_shift = 0;
 	svd->vdev_ms_count = 0;
 	svd->vdev_top_zap = 0;
 
 	/* If tvd already has a metaslab group it must be svd's. */
 	if (tvd->vdev_mg)
 		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
 	tvd->vdev_mg = svd->vdev_mg;
 	tvd->vdev_ms = svd->vdev_ms;
 
 	svd->vdev_mg = NULL;
 	svd->vdev_ms = NULL;
 
 	/* Re-point the metaslab group at its new owner. */
 	if (tvd->vdev_mg != NULL)
 		tvd->vdev_mg->mg_vd = tvd;
 
 	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
 	svd->vdev_checkpoint_sm = NULL;
 
 	tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
 	svd->vdev_alloc_bias = VDEV_BIAS_NONE;
 
 	/* Space accounting. */
 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
 
 	svd->vdev_stat.vs_alloc = 0;
 	svd->vdev_stat.vs_space = 0;
 	svd->vdev_stat.vs_dspace = 0;
 
 	/*
 	 * State which may be set on a top-level vdev that's in the
 	 * process of being removed.
 	 */
 	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
 	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
 	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
 	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
 	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
 	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
 	ASSERT0(tvd->vdev_removing);
 	tvd->vdev_removing = svd->vdev_removing;
 	tvd->vdev_indirect_config = svd->vdev_indirect_config;
 	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
 	tvd->vdev_indirect_births = svd->vdev_indirect_births;
 	/* Both vdevs keep a valid range tree; only the contents trade places. */
 	range_tree_swap(&svd->vdev_obsolete_segments,
 	    &tvd->vdev_obsolete_segments);
 	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
 	svd->vdev_indirect_config.vic_mapping_object = 0;
 	svd->vdev_indirect_config.vic_births_object = 0;
 	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
 	svd->vdev_indirect_mapping = NULL;
 	svd->vdev_indirect_births = NULL;
 	svd->vdev_obsolete_sm = NULL;
 	svd->vdev_removing = 0;
 
 	/*
 	 * Move every pending entry on svd's per-txg metaslab and DTL lists
 	 * over to tvd, and replace svd with tvd on the pool's per-txg vdev
 	 * list wherever svd was present.
 	 */
 	for (t = 0; t < TXG_SIZE; t++) {
 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
 	}
 
 	/* Dirty-list membership follows the state to tvd. */
 	if (list_link_active(&svd->vdev_config_dirty_node)) {
 		vdev_config_clean(svd);
 		vdev_config_dirty(tvd);
 	}
 
 	if (list_link_active(&svd->vdev_state_dirty_node)) {
 		vdev_state_clean(svd);
 		vdev_state_dirty(tvd);
 	}
 
 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
 	svd->vdev_deflate_ratio = 0;
 
 	tvd->vdev_islog = svd->vdev_islog;
 	svd->vdev_islog = 0;
 
 	dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 /*
  * Recursively set vdev_top to tvd for vd and every vdev beneath it.
  * A NULL vd (an empty child slot) is ignored.
  */
 static void
 vdev_top_update(vdev_t *tvd, vdev_t *vd)
 {
 	if (vd != NULL) {
 		int c = 0;
 
 		vd->vdev_top = tvd;
 
 		/* Descend into each child slot in turn. */
 		while (c < vd->vdev_children) {
 			vdev_top_update(tvd, vd->vdev_child[c]);
 			c++;
 		}
 	}
 }
 
 /*
  * Add a mirror/replacing vdev above an existing vdev.
  *
  * A new interior vdev using 'ops' is allocated in cvd's slot under cvd's
  * current parent, and cvd is re-attached beneath it.  The new vdev inherits
  * cvd's sizes, ashift, state and creation txg, and is returned.  The caller
  * must hold all SCL_ALL config locks as writer.
  */
 vdev_t *
 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
 {
 	spa_t *spa = cvd->vdev_spa;
 	vdev_t *oldparent = cvd->vdev_parent;
 	vdev_t *newvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* The new vdev takes over cvd's child index; its guid is generated. */
 	newvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
 
 	/* Inherit the geometry and state of the vdev being wrapped. */
 	newvd->vdev_asize = cvd->vdev_asize;
 	newvd->vdev_min_asize = cvd->vdev_min_asize;
 	newvd->vdev_max_asize = cvd->vdev_max_asize;
 	newvd->vdev_psize = cvd->vdev_psize;
 	newvd->vdev_ashift = cvd->vdev_ashift;
 	newvd->vdev_state = cvd->vdev_state;
 	newvd->vdev_crtxg = cvd->vdev_crtxg;
 
 	/* Splice the new vdev in between cvd and its old parent. */
 	vdev_remove_child(oldparent, cvd);
 	vdev_add_child(oldparent, newvd);
 
 	/* cvd's new id is the new parent's current child count. */
 	cvd->vdev_id = newvd->vdev_children;
 	vdev_add_child(newvd, cvd);
 
 	/* Refresh vdev_top throughout the affected subtree. */
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* If the new vdev is now top-level, it takes cvd's top-level state. */
 	if (newvd == newvd->vdev_top)
 		vdev_top_transfer(cvd, newvd);
 
 	return (newvd);
 }
 
 /*
  * Remove a 1-way mirror/replacing vdev from the tree.
  *
  * cvd must be the only child of its parent, which must be a mirror,
  * replacing or spare vdev.  cvd is moved into its parent's slot under the
  * grandparent and the now-childless parent is freed.  The caller must hold
  * all SCL_ALL config locks as writer.
  */
 void
 vdev_remove_parent(vdev_t *cvd)
 {
 	vdev_t *interior = cvd->vdev_parent;
 	vdev_t *grandparent = interior->vdev_parent;
 
 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	ASSERT(interior->vdev_children == 1);
 	ASSERT(interior->vdev_ops == &vdev_mirror_ops ||
 	    interior->vdev_ops == &vdev_replacing_ops ||
 	    interior->vdev_ops == &vdev_spare_ops);
 
 	/* The child takes on the ashift of the vdev it replaces. */
 	cvd->vdev_ashift = interior->vdev_ashift;
 
 	/* Unhook both the child and the interior vdev from the tree. */
 	vdev_remove_child(interior, cvd);
 	vdev_remove_child(grandparent, interior);
 
 	/*
 	 * If cvd will replace the interior vdev as a top-level vdev,
 	 * preserve the interior vdev's guid.  Otherwise, we could have
 	 * detached an offline device, and when we go to import the pool
 	 * we'll think we have two top-level vdevs, instead of a different
 	 * version of the same top-level vdev.
 	 */
 	if (interior->vdev_top == interior) {
 		uint64_t guid_delta = interior->vdev_guid - cvd->vdev_guid;
 
 		cvd->vdev_orig_guid = cvd->vdev_guid;
 		cvd->vdev_guid += guid_delta;
 		cvd->vdev_guid_sum += guid_delta;
 	}
 
 	/* Re-attach the child in the slot the interior vdev occupied. */
 	cvd->vdev_id = interior->vdev_id;
 	vdev_add_child(grandparent, cvd);
 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
 
 	/* A newly top-level child takes the interior vdev's state. */
 	if (cvd == cvd->vdev_top)
 		vdev_top_transfer(interior, cvd);
 
 	ASSERT(interior->vdev_children == 0);
 	vdev_free(interior);
 }
 
 /*
  * Create vd's metaslab group if it does not have one yet.
  *
  * metaslab_group_create() was delayed until the allocation bias was
  * available; the bias selects which metaslab class the group joins.
  */
 static void
 vdev_metaslab_group_create(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	metaslab_class_t *mc;
 
 	/* The group already exists; there is nothing to do. */
 	if (vd->vdev_mg != NULL)
 		return;
 
 	/* A log vdev with no explicit bias is biased toward the log class. */
 	if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
 		vd->vdev_alloc_bias = VDEV_BIAS_LOG;
 
 	ASSERT3U(vd->vdev_islog, ==,
 	    (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
 
 	switch (vd->vdev_alloc_bias) {
 	case VDEV_BIAS_LOG:
 		mc = spa_log_class(spa);
 		break;
 	case VDEV_BIAS_SPECIAL:
 		mc = spa_special_class(spa);
 		break;
 	case VDEV_BIAS_DEDUP:
 		mc = spa_dedup_class(spa);
 		break;
 	default:
 		mc = spa_normal_class(spa);
 		break;
 	}
 
 	vd->vdev_mg = metaslab_group_create(mc, vd, spa->spa_alloc_count);
 
 	/*
 	 * The spa ashift values currently only reflect the general vdev
 	 * classes.  Class destination is late binding so ashift checking
 	 * had to wait until now.  Only top-level, non-aux vdevs of the
 	 * normal class with a known ashift take part.
 	 */
 	if (vd->vdev_top != vd || vd->vdev_ashift == 0 ||
 	    mc != spa_normal_class(spa) || vd->vdev_aux != NULL)
 		return;
 
 	if (vd->vdev_ashift > spa->spa_max_ashift)
 		spa->spa_max_ashift = vd->vdev_ashift;
 	if (vd->vdev_ashift < spa->spa_min_ashift)
 		spa->spa_min_ashift = vd->vdev_ashift;
 }
 
 /*
  * Create (or grow) the metaslab array of a vdev so that it covers
  * vdev_asize >> vdev_ms_shift metaslabs, initializing each new metaslab.
  *
  * When txg is 0 and the vdev has a metaslab array object, the space map
  * object number of each new metaslab is read from that array; otherwise
  * the metaslab is initialized with object 0.  The metaslab group is
  * activated only on the initial creation (not when expanding) and only if
  * the vdev is not being removed.
  *
  * Returns 0 on success (including the no-op case of vdev_ms_shift == 0),
  * or the error from dmu_read()/metaslab_init().
  *
  * NOTE(review): on the error returns inside the loop, vdev_ms and
  * vdev_ms_count have already been switched to the new, partially
  * initialized array.
  */
 int
 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t m;
 	uint64_t oldc = vd->vdev_ms_count;
 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
 	metaslab_t **mspp;
 	int error;
 	/* A non-zero existing count means we are growing, not creating. */
 	boolean_t expanding = (oldc != 0);
 
 	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	/*
 	 * This vdev is not being allocated from yet or is a hole.
 	 */
 	if (vd->vdev_ms_shift == 0)
 		return (0);
 
 	ASSERT(!vd->vdev_ishole);
 
 	ASSERT(oldc <= newc);
 
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
 
 	/* Carry the existing metaslab pointers over into the larger array. */
 	if (expanding) {
 		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
 		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
 	}
 
 	vd->vdev_ms = mspp;
 	vd->vdev_ms_count = newc;
 	/* Initialize only the metaslabs that did not exist before. */
 	for (m = oldc; m < newc; m++) {
 		uint64_t object = 0;
 
 		/*
 		 * vdev_ms_array may be 0 if we are creating the "fake"
 		 * metaslabs for an indirect vdev for zdb's leak detection.
 		 * See zdb_leak_init().
 		 */
 		if (txg == 0 && vd->vdev_ms_array != 0) {
 			error = dmu_read(mos, vd->vdev_ms_array,
 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
 			    DMU_READ_PREFETCH);
 			if (error != 0) {
 				vdev_dbgmsg(vd, "unable to read the metaslab "
 				    "array [error=%d]", error);
 				return (error);
 			}
 		}
 
 #ifndef _KERNEL
 		/*
 		 * To accommodate zdb_leak_init() fake indirect
 		 * metaslabs, we allocate a metaslab group for
 		 * indirect vdevs which normally don't have one.
 		 */
 		if (vd->vdev_mg == NULL) {
 			ASSERT0(vdev_is_concrete(vd));
 			vdev_metaslab_group_create(vd);
 		}
 #endif
 		error = metaslab_init(vd->vdev_mg, m, object, txg,
 		    &(vd->vdev_ms[m]));
 		if (error != 0) {
 			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
 			    error);
 			return (error);
 		}
 	}
 
 	/*
 	 * With txg == 0 the entry assertion does not require SCL_ALLOC to
 	 * be held, so take it here around the activation and drop it after.
 	 */
 	if (txg == 0)
 		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
 
 	/*
 	 * If the vdev is being removed we don't activate
 	 * the metaslabs since we want to ensure that no new
 	 * allocations are performed on this device.
 	 */
 	if (!expanding && !vd->vdev_removing) {
 		metaslab_group_activate(vd->vdev_mg);
 	}
 
 	if (txg == 0)
 		spa_config_exit(spa, SCL_ALLOC, FTAG);
 
 	return (0);
 }
 
 /*
  * Release a vdev's metaslab state: close the checkpoint space map if one
  * is open, then passivate the metaslab group, destroy every metaslab and
  * free the metaslab array.  Safe to call more than once, since each piece
  * of state is cleared after it is released.
  */
 void
 vdev_metaslab_fini(vdev_t *vd)
 {
 	if (vd->vdev_checkpoint_sm != NULL) {
 		ASSERT(spa_feature_is_active(vd->vdev_spa,
 		    SPA_FEATURE_POOL_CHECKPOINT));
 		space_map_close(vd->vdev_checkpoint_sm);
 		/*
 		 * Closing the space map is not enough; the pointer must be
 		 * reset as well.  vdev_metaslab_fini() may be called multiple
 		 * times for certain operations (i.e. when destroying a pool)
 		 * and this clause must never execute twice.  The vdev_ms
 		 * clause below follows the same pattern.
 		 */
 		vd->vdev_checkpoint_sm = NULL;
 	}
 
 	if (vd->vdev_ms != NULL) {
 		metaslab_group_t *mg = vd->vdev_mg;
 		uint64_t nms;
 
 		/* Stop allocations from the group before tearing it down. */
 		metaslab_group_passivate(mg);
 
 		nms = vd->vdev_ms_count;
 		for (uint64_t m = 0; m < nms; m++) {
 			metaslab_t *cur = vd->vdev_ms[m];
 
 			/* Slots may be empty; skip those. */
 			if (cur == NULL)
 				continue;
 			metaslab_fini(cur);
 		}
 
 		kmem_free(vd->vdev_ms, nms * sizeof (metaslab_t *));
 		vd->vdev_ms = NULL;
 		vd->vdev_ms_count = 0;
 
 		/* With every metaslab gone the group histogram must be empty. */
 		for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
 			ASSERT0(mg->mg_histogram[i]);
 	}
 
 	ASSERT0(vd->vdev_ms_count);
 }
 
 /*
  * Per-probe state shared by all the I/Os of one vdev_probe() invocation.
  * Allocated in vdev_probe() and freed by vdev_probe_done() when the parent
  * (null) probe zio completes.
  */
 typedef struct vdev_probe_stats {
 	boolean_t	vps_readable;	/* set when a probe read succeeds */
 	boolean_t	vps_writeable;	/* set when a probe write succeeds */
 	int		vps_flags;	/* zio flags used for the probe I/Os */
 } vdev_probe_stats_t;
 
 /*
  * Completion callback for every zio issued on behalf of a probe.
  *
  * It is invoked for three kinds of zio, distinguished by io_type:
  *  - READ:  record readability; if the read succeeded and the pool is
  *           writeable, write the same buffer back to the same location,
  *           otherwise free the buffer.
  *  - WRITE: record writeability and free the buffer.
  *  - NULL:  the parent probe zio; fold the results into the vdev's
  *           cant_read/cant_write flags, set the probe's final error,
  *           clear vdev_probe_zio, fail any waiting parent zios that can no
  *           longer access the vdev, and free the probe stats.
  */
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
 
 	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
 			/*
 			 * Ownership of io_abd passes to the write zio; it
 			 * is freed when that write completes (below).
 			 */
 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_abd,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
 		} else {
 			abd_free(zio->io_abd);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		abd_free(zio->io_abd);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
 		zio_t *pio;
 
 		/* These flags are only ever set here, never cleared. */
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
 
 		/*
 		 * The probe succeeds if the vdev is readable and either
 		 * writeable or the pool itself is not writeable.
 		 */
 		if (vdev_readable(vd) &&
 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
 			zio->io_error = 0;
 		} else {
 			ASSERT(zio->io_error != 0);
 			vdev_dbgmsg(vd, "failed probe");
 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
 			    spa, vd, NULL, 0, 0);
 			zio->io_error = SET_ERROR(ENXIO);
 		}
 
 		/* The probe is over; allow a new one to be started. */
 		mutex_enter(&vd->vdev_probe_lock);
 		ASSERT(vd->vdev_probe_zio == zio);
 		vd->vdev_probe_zio = NULL;
 		mutex_exit(&vd->vdev_probe_lock);
 
 		/* Fail every waiting parent that can't access the vdev. */
 		zio_link_t *zl = NULL;
 		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
 			if (!vdev_accessible(vd, pio))
 				pio->io_error = SET_ERROR(ENXIO);
 
 		kmem_free(vps, sizeof (*vps));
 	}
 }
 
 /*
  * Determine whether this device is accessible.
  *
  * Read and write to several known locations: the pad regions of each
  * vdev label but the first, which we leave alone in case it contains
  * a VTOC.
  *
  * vd must be a leaf vdev.  If 'zio' is non-NULL it is made a parent of the
  * probe, the probe is issued asynchronously, and NULL is returned.  If
  * 'zio' is NULL the un-issued probe zio is returned for the caller to
  * issue.  NULL is also returned, without probing, when 'zio' is itself a
  * probe I/O.
  */
 zio_t *
 vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_probe_stats_t *vps = NULL;
 	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	/*
 	 * Don't probe the probe.
 	 */
 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
 		return (NULL);
 
 	/*
 	 * To prevent 'probe storms' when a device fails, we create
 	 * just one probe i/o at a time.  All zios that want to probe
 	 * this vdev will become parents of the probe io.
 	 */
 	mutex_enter(&vd->vdev_probe_lock);
 
 	/* vps stays NULL when we join a probe that is already running. */
 	if ((pio = vd->vdev_probe_zio) == NULL) {
 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
 
 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
 		    ZIO_FLAG_TRYHARD;
 
 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
 			/*
 			 * vdev_cant_read and vdev_cant_write can only
 			 * transition from TRUE to FALSE when we have the
 			 * SCL_ZIO lock as writer; otherwise they can only
 			 * transition from FALSE to TRUE.  This ensures that
 			 * any zio looking at these values can assume that
 			 * failures persist for the life of the I/O.  That's
 			 * important because when a device has intermittent
 			 * connectivity problems, we want to ensure that
 			 * they're ascribed to the device (ENXIO) and not
 			 * the zio (EIO).
 			 *
 			 * Since we hold SCL_ZIO as writer here, clear both
 			 * values so the probe can reevaluate from first
 			 * principles.
 			 */
 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
 			vd->vdev_cant_read = B_FALSE;
 			vd->vdev_cant_write = B_FALSE;
 		}
 
 		/* The parent probe zio; vdev_probe_done() runs when it ends. */
 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
 		    vdev_probe_done, vps,
 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
 
 		/*
 		 * We can't change the vdev state in this context, so we
 		 * kick off an async task to do it on our behalf.
 		 */
 		if (zio != NULL) {
 			vd->vdev_probe_wanted = B_TRUE;
 			spa_async_request(spa, SPA_ASYNC_PROBE);
 		}
 	}
 
 	/* Make the caller's zio wait on the (new or existing) probe. */
 	if (zio != NULL)
 		zio_add_child(zio, pio);
 
 	mutex_exit(&vd->vdev_probe_lock);
 
 	/* A probe was already in flight; we only had to attach to it. */
 	if (vps == NULL) {
 		ASSERT(zio != NULL);
 		return (NULL);
 	}
 
 	/* Issue one pad-region read per label, skipping label 0. */
 	for (int l = 1; l < VDEV_LABELS; l++) {
 		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
 		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
 	/* With no parent zio, the caller is responsible for issuing pio. */
 	if (zio == NULL)
 		return (pio);
 
 	zio_nowait(pio);
 	return (NULL);
 }
 
 /*
  * Taskq callback used by vdev_open_children(): open the vdev passed in
  * 'arg' and store the result in its vdev_open_error.  vdev_open_thread is
  * set to the current thread for the duration of the open.
  */
 static void
 vdev_open_child(void *arg)
 {
 	vdev_t *child = arg;
 
 	child->vdev_open_thread = curthread;
 	child->vdev_open_error = vdev_open(child);
 	child->vdev_open_thread = NULL;
 }
 
 /*
  * Return B_TRUE if vd, or any vdev beneath it, has a path that begins
  * with ZVOL_DIR; B_FALSE otherwise.
  */
 boolean_t
 vdev_uses_zvols(vdev_t *vd)
 {
 	const char *path = vd->vdev_path;
 
 	/* Does this vdev itself live under the zvol directory? */
 	if (path != NULL && strncmp(path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0)
 		return (B_TRUE);
 
 	/* Otherwise the answer depends on its descendants. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vdev_uses_zvols(vd->vdev_child[c]))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Open every child of vd, recording each result in the child's
  * vdev_open_error.  Normally the opens are dispatched to a temporary
  * taskq with one thread per child; when any vdev in the subtree is backed
  * by a zvol they are done serially in the calling thread instead.
  */
 void
 vdev_open_children(vdev_t *vd)
 {
 	taskq_t *tq;
 	int children = vd->vdev_children;
 
 	/*
 	 * in order to handle pools on top of zvols, do the opens
 	 * in a single thread so that the same thread holds the
 	 * spa_namespace_lock
 	 */
 	if (vdev_uses_zvols(vd)) {
 		for (int c = 0; c < children; c++)
 			vd->vdev_child[c]->vdev_open_error =
 			    vdev_open(vd->vdev_child[c]);
 		return;
 	}
 	tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 
 	/* Dispatch must succeed: a skipped child would never be opened. */
 	for (int c = 0; c < children; c++)
 		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
-		    TQ_SLEEP) != NULL);
+		    TQ_SLEEP) != TASKQID_INVALID);
 
 	/* NOTE(review): presumably waits for the dispatched opens - confirm. */
 	taskq_destroy(tq);
 }
 
 /*
  * Compute the raidz-deflation ratio for a top-level vdev.
  *
  * The 128k (1 << 17) figure is hard-coded because it is the "typical"
  * blocksize.  Even though SPA_MAXBLOCKSIZE changed, this algorithm can not
  * change, otherwise it would inconsistently account for existing bp's.
  *
  * Nothing is done for vdevs that are not top-level, are holes, or have no
  * ashift yet.
  */
 static void
 vdev_set_deflate_ratio(vdev_t *vd)
 {
 	if (vd != vd->vdev_top || vd->vdev_ishole || vd->vdev_ashift == 0)
 		return;
 
 	vd->vdev_deflate_ratio = (1 << 17) /
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 }
 
 /*
  * Prepare a virtual device for access.
  */
 int
 vdev_open(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	int error;
 	uint64_t osize = 0;
 	uint64_t max_osize = 0;
 	uint64_t asize, max_asize, psize;
 	uint64_t ashift = 0;
 
 	ASSERT(vd->vdev_open_thread == curthread ||
 	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
 	    vd->vdev_state == VDEV_STATE_OFFLINE);
 
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	vd->vdev_cant_read = B_FALSE;
 	vd->vdev_cant_write = B_FALSE;
 	vd->vdev_min_asize = vdev_get_min_asize(vd);
 
 	/*
 	 * If this vdev is not removed, check its fault status.  If it's
 	 * faulted, bail out of the open.
 	 */
 	if (!vd->vdev_removed && vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	} else if (vd->vdev_offline) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
 		return (SET_ERROR(ENXIO));
 	}
 
 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
 
 	/*
 	 * Reset the vdev_reopening flag so that we actually close
 	 * the vdev on error.
 	 */
 	vd->vdev_reopening = B_FALSE;
 	if (zio_injection_enabled && error == 0)
 		error = zio_handle_device_injection(vd, NULL, ENXIO);
 
 	if (error) {
 		if (vd->vdev_removed &&
 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
 			vd->vdev_removed = B_FALSE;
 
 		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
 			    vd->vdev_stat.vs_aux);
 		} else {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    vd->vdev_stat.vs_aux);
 		}
 		return (error);
 	}
 
 	vd->vdev_removed = B_FALSE;
 
 	/*
 	 * Recheck the faulted flag now that we have confirmed that
 	 * the vdev is accessible.  If we're faulted, bail.
 	 */
 	if (vd->vdev_faulted) {
 		ASSERT(vd->vdev_children == 0);
 		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
 		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    vd->vdev_label_aux);
 		return (SET_ERROR(ENXIO));
 	}
 
 	if (vd->vdev_degraded) {
 		ASSERT(vd->vdev_children == 0);
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 		    VDEV_AUX_ERR_EXCEEDED);
 	} else {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
 	}
 
 	/*
 	 * For hole or missing vdevs we just return success.
 	 */
 	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
 		return (0);
 
 	for (int c = 0; c < vd->vdev_children; c++) {
 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
 			    VDEV_AUX_NONE);
 			break;
 		}
 	}
 
 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
 	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
 
 	if (vd->vdev_children == 0) {
 		if (osize < SPA_MINDEVSIZE) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = osize;
 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
 		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
 		    VDEV_LABEL_END_SIZE);
 	} else {
 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_TOO_SMALL);
 			return (SET_ERROR(EOVERFLOW));
 		}
 		psize = 0;
 		asize = osize;
 		max_asize = max_osize;
 	}
 
 	vd->vdev_psize = psize;
 
 	/*
 	 * Make sure the allocatable size hasn't shrunk too much.
 	 */
 	if (asize < vd->vdev_min_asize) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (vd->vdev_asize == 0) {
 		/*
 		 * This is the first-ever open, so use the computed values.
 		 * For testing purposes, a higher ashift can be requested.
 		 */
 		vd->vdev_asize = asize;
 		vd->vdev_max_asize = max_asize;
 		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
 		vd->vdev_ashift = MAX(zfs_ashift_min, vd->vdev_ashift);
 	} else {
 		/*
 		 * Detect if the alignment requirement has increased.
 		 * We don't want to make the pool unavailable, just
 		 * issue a warning instead.
 		 */
 		if (ashift > vd->vdev_top->vdev_ashift &&
 		    vd->vdev_ops->vdev_op_leaf) {
 			cmn_err(CE_WARN,
 			    "Disk, '%s', has a block alignment that is "
 			    "larger than the pool's alignment\n",
 			    vd->vdev_path);
 		}
 		vd->vdev_max_asize = max_asize;
 	}
 
 	/*
 	 * If all children are healthy we update asize if either:
 	 * The asize has increased, due to a device expansion caused by dynamic
 	 * LUN growth or vdev replacement, and automatic expansion is enabled;
 	 * making the additional space available.
 	 *
 	 * The asize has decreased, due to a device shrink usually caused by a
 	 * vdev replace with a smaller device. This ensures that calculations
 	 * based of max_asize and asize e.g. esize are always valid. It's safe
 	 * to do this as we've already validated that asize is greater than
 	 * vdev_min_asize.
 	 */
 	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
 	    ((asize > vd->vdev_asize &&
 	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
 	    (asize < vd->vdev_asize)))
 		vd->vdev_asize = asize;
 
 	vdev_set_min_asize(vd);
 
 	/*
 	 * Ensure we can issue some IO before declaring the
 	 * vdev open for business.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
 		    VDEV_AUX_ERR_EXCEEDED);
 		return (error);
 	}
 
 	/*
 	 * Track the min and max ashift values for normal data devices.
 	 *
 	 * DJB - TBD these should perhaps be tracked per allocation class
 	 * (e.g. spa_min_ashift is used to round up post compression buffers)
 	 */
 	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
 	    vd->vdev_alloc_bias == VDEV_BIAS_NONE &&
 	    vd->vdev_aux == NULL) {
 		if (vd->vdev_ashift > spa->spa_max_ashift)
 			spa->spa_max_ashift = vd->vdev_ashift;
 		if (vd->vdev_ashift < spa->spa_min_ashift)
 			spa->spa_min_ashift = vd->vdev_ashift;
 	}
 
 	/*
 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
 	 * since this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
 	    vdev_resilver_needed(vd, NULL, NULL))
 		spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 	return (0);
 }
 
 /*
  * Called once the vdevs are all opened, this routine validates the label
  * contents. This needs to be done before vdev_load() so that we don't
  * inadvertently do repair I/Os to the wrong device.
  *
  * This function will only return failure if one of the vdevs indicates that it
  * has since been destroyed or exported.  This is only possible if
  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
  * will be updated but the function will return 0.
  */
 int
 vdev_validate(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	uint64_t guid = 0, aux_guid = 0, top_guid;
 	uint64_t state;
 	nvlist_t *nvl;
 	uint64_t txg;
 
 	/* Validation can be switched off entirely via vdev_validate_skip. */
 	if (vdev_validate_skip)
 		return (0);
 
 	/*
 	 * Validate the children first.  Any child failure is reported to
 	 * our caller as EBADF.
 	 */
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		if (vdev_validate(vd->vdev_child[c]) != 0)
 			return (SET_ERROR(EBADF));
 
 	/*
 	 * If the device has already failed, or was marked offline, don't do
 	 * any further validation.  Otherwise, label I/O will fail and we will
 	 * overwrite the previous state.
 	 *
 	 * Interior (non-leaf) vdevs are also finished at this point: only
 	 * leaf vdevs have their labels checked below.
 	 */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
 		return (0);
 
 	/*
 	 * If we are performing an extreme rewind, we allow for a label that
 	 * was modified at a point after the current txg.
 	 * If config lock is not held do not check for the txg. spa_sync could
 	 * be updating the vdev's label before updating spa_last_synced_txg.
 	 */
 	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
 	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
 		txg = UINT64_MAX;
 	else
 		txg = spa_last_synced_txg(spa);
 
 	/*
 	 * Read the label.  From here on, every return path frees 'label'
 	 * with nvlist_free() before leaving the function.
 	 */
 	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_BAD_LABEL);
 		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
 		    "txg %llu", (u_longlong_t)txg);
 		return (0);
 	}
 
 	/*
 	 * Determine if this vdev has been split off into another
 	 * pool.  If so, then refuse to open it.
 	 */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
 	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_SPLIT_POOL);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
 		return (0);
 	}
 
 	/* The label must carry a pool guid ('guid' holds it for now). */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (0);
 	}
 
 	/*
 	 * If config is not trusted then ignore the spa guid check. This is
 	 * necessary because if the machine crashed during a re-guid the new
 	 * guid might have been written to all of the vdev labels, but not the
 	 * cached config. The check will be performed again once we have the
 	 * trusted config from the MOS.
 	 */
 	if (spa->spa_trust_config && guid != spa_guid(spa)) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
 		    "match config (%llu != %llu)", (u_longlong_t)guid,
 		    (u_longlong_t)spa_guid(spa));
 		return (0);
 	}
 
 	/*
 	 * 'aux_guid' is reused here: from now on it holds the original guid
 	 * recorded in the label's vdev tree, or 0 if the vdev tree or the
 	 * original guid is absent.
 	 */
 	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
 	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
 	    &aux_guid) != 0)
 		aux_guid = 0;
 
 	/* 'guid' is reused as well: it now holds the label's vdev guid. */
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_GUID);
 		return (0);
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
 	    != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_TOP_GUID);
 		return (0);
 	}
 
 	/*
 	 * If this vdev just became a top-level vdev because its sibling was
 	 * detached, it will have adopted the parent's vdev guid -- but the
 	 * label may or may not be on disk yet. Fortunately, either version
 	 * of the label will have the same top guid, so if we're a top-level
 	 * vdev, we can safely compare to that instead.
 	 * However, if the config comes from a cachefile that failed to update
 	 * after the detach, a top-level vdev will appear as a non top-level
 	 * vdev in the config. Also relax the constraints if we perform an
 	 * extreme rewind.
 	 *
 	 * If we split this vdev off instead, then we also check the
 	 * original pool's guid. We don't want to consider the vdev
 	 * corrupt if it is partway through a split operation.
 	 */
 	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
 		boolean_t mismatch = B_FALSE;
 		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
 			/* Strict: only a top-level vdev may match top_guid. */
 			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
 				mismatch = B_TRUE;
 		} else {
 			/*
 			 * Relaxed: accept either our guid matching the
 			 * label's top guid, or our top-level vdev's guid
 			 * matching the label's vdev guid.
 			 */
 			if (vd->vdev_guid != top_guid &&
 			    vd->vdev_top->vdev_guid != guid)
 				mismatch = B_TRUE;
 		}
 
 		if (mismatch) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			nvlist_free(label);
 			vdev_dbgmsg(vd, "vdev_validate: config guid "
 			    "doesn't match label guid");
 			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
 			    (u_longlong_t)vd->vdev_guid,
 			    (u_longlong_t)vd->vdev_top->vdev_guid);
 			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
 			    "aux_guid %llu", (u_longlong_t)guid,
 			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
 			return (0);
 		}
 	}
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(label);
 		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
 		    ZPOOL_CONFIG_POOL_STATE);
 		return (0);
 	}
 
 	/* Only the extracted 'state' is needed below; release the label. */
 	nvlist_free(label);
 
 	/*
 	 * If this is a verbatim import, no need to check the
 	 * state of the pool.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
 	    spa_load_state(spa) == SPA_LOAD_OPEN &&
 	    state != POOL_STATE_ACTIVE) {
 		/* This is the only leaf-level check that fails the call. */
 		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
 		    "for spa %s", (u_longlong_t)state, spa->spa_name);
 		return (SET_ERROR(EBADF));
 	}
 
 	/*
 	 * If we were able to open and validate a vdev that was
 	 * previously marked permanently unavailable, clear that state
 	 * now.
 	 */
 	if (vd->vdev_not_present)
 		vd->vdev_not_present = 0;
 
 	return (0);
 }
 
 /*
  * Copy the path of a single vdev (svd) onto another (dvd), logging any
  * change.  Nothing happens when the source has no path or when both paths
  * already compare equal.  A previously set destination path is freed before
  * it is replaced.
  */
 static void
 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
 {
 	/* Without a source path there is nothing to copy. */
 	if (svd->vdev_path == NULL)
 		return;
 
 	/* Destination has no path yet: simply adopt the source's. */
 	if (dvd->vdev_path == NULL) {
 		dvd->vdev_path = spa_strdup(svd->vdev_path);
 		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
 		    (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
 		return;
 	}
 
 	/* Both paths are set; only act when they actually differ. */
 	if (strcmp(svd->vdev_path, dvd->vdev_path) == 0)
 		return;
 
 	zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
 	    "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
 	    dvd->vdev_path, svd->vdev_path);
 	spa_strfree(dvd->vdev_path);
 	dvd->vdev_path = spa_strdup(svd->vdev_path);
 }
 
 /*
  * Copy vdev paths from the tree rooted at svd into the tree rooted at dvd,
  * recursing through all children.  Both trees must have the same geometry
  * (matching vdev type, guid and child count at every level); EINVAL is
  * returned at the first mismatch.  Intended to copy paths from userland
  * config into MOS config.
  */
 int
 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
 {
 	/*
 	 * Missing source vdevs, holes present on both sides and indirect
 	 * destination vdevs are accepted without descending any further.
 	 */
 	if (svd->vdev_ops == &vdev_missing_ops)
 		return (0);
 	if (svd->vdev_ishole && dvd->vdev_ishole)
 		return (0);
 	if (dvd->vdev_ops == &vdev_indirect_ops)
 		return (0);
 
 	if (svd->vdev_ops != dvd->vdev_ops) {
 		vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
 		    svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_guid != dvd->vdev_guid) {
 		vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
 		    "%llu)", (u_longlong_t)svd->vdev_guid,
 		    (u_longlong_t)dvd->vdev_guid);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (svd->vdev_children != dvd->vdev_children) {
 		vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
 		    "%llu != %llu", (u_longlong_t)svd->vdev_children,
 		    (u_longlong_t)dvd->vdev_children);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* Descend pairwise; the first failing child aborts the copy. */
 	for (uint64_t c = 0; c < svd->vdev_children; c++) {
 		int err;
 
 		err = vdev_copy_path_strict(svd->vdev_child[c],
 		    dvd->vdev_child[c]);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Only leaf vdevs have their path copied. */
 	if (svd->vdev_ops->vdev_op_leaf) {
 		vdev_copy_path_impl(svd, dvd);
 	}
 
 	return (0);
 }
 
 /*
  * For every concrete leaf under dvd, look up the vdev with the same guid
  * inside the source top-level vdev stvd and, if one of the same type is
  * found, copy its path over.  stvd must be a top-level vdev with the same
  * id as dvd's top-level vdev (both asserted).
  */
 static void
 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
 {
 	ASSERT(stvd->vdev_top == stvd);
 	ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
 
 	for (uint64_t c = 0; c < dvd->vdev_children; c++)
 		vdev_copy_path_search(stvd, dvd->vdev_child[c]);
 
 	if (dvd->vdev_ops->vdev_op_leaf && vdev_is_concrete(dvd)) {
 		/*
 		 * The idea here is that while a vdev can shift positions
 		 * within a top vdev (when replacing, attaching mirror, etc.)
 		 * it cannot step outside of it.
 		 */
 		vdev_t *match = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
 
 		if (match != NULL && match->vdev_ops == dvd->vdev_ops) {
 			ASSERT(match->vdev_ops->vdev_op_leaf);
 			vdev_copy_path_impl(match, dvd);
 		}
 	}
 }
 
 /*
  * Recursively copy vdev paths from one root vdev to another. Source and
  * destination vdev trees may differ in geometry. For each destination leaf
  * vdev, search for a vdev with the same guid and top vdev id in the source.
  * Only the top-level children present in both trees are visited.
  * Intended to copy paths from userland config into MOS config.
  */
 void
 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
 {
 	uint64_t common;
 
 	ASSERT(srvd->vdev_ops == &vdev_root_ops);
 	ASSERT(drvd->vdev_ops == &vdev_root_ops);
 
 	/* Walk only as many top-level vdevs as both roots have. */
 	common = MIN(srvd->vdev_children, drvd->vdev_children);
 	for (uint64_t c = 0; c < common; c++)
 		vdev_copy_path_search(srvd->vdev_child[c], drvd->vdev_child[c]);
 }
 
 /*
  * Close a virtual device.
  *
  * The caller must hold all SCL_STATE_ALL config locks as writer (asserted).
  * After the type-specific close operation has run and the vdev cache has
  * been purged, the vdev is left OFFLINE if it is marked offline and CLOSED
  * otherwise, with its aux state reset to VDEV_AUX_NONE.
  */
 void
 vdev_close(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *parent = vd->vdev_parent;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/*
 	 * If our parent is reopening, then we are as well, unless we are
 	 * going offline.
 	 */
 	if (parent != NULL && parent->vdev_reopening) {
 		vd->vdev_reopening = !vd->vdev_offline;
 	}
 
 	vd->vdev_ops->vdev_op_close(vd);
 
 	vdev_cache_purge(vd);
 
 	/*
 	 * We record the previous state before we close it, so that if we are
 	 * doing a reopen(), we don't generate FMA ereports if we notice that
 	 * it's still faulted.
 	 */
 	vd->vdev_prevstate = vd->vdev_state;
 
 	vd->vdev_state = vd->vdev_offline ?
 	    VDEV_STATE_OFFLINE : VDEV_STATE_CLOSED;
 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 }
 
 /*
  * Walk the vdev tree rooted at vd, children first, and invoke the
  * vdev_op_hold operation on every leaf.  The pool is asserted to be the
  * root pool; nothing is done while the pool state is still
  * POOL_STATE_UNINITIALIZED.
  */
 void
 vdev_hold(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_is_root(spa));
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		for (int i = 0; i < vd->vdev_children; i++) {
 			vdev_hold(vd->vdev_child[i]);
 		}
 
 		if (vd->vdev_ops->vdev_op_leaf) {
 			vd->vdev_ops->vdev_op_hold(vd);
 		}
 	}
 }
 
 /*
  * Walk the vdev tree rooted at vd, children first, and invoke the
  * vdev_op_rele operation on every leaf.  The pool is asserted to be the
  * root pool.
  */
 void
 vdev_rele(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_is_root(spa));
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_rele(vd->vdev_child[i]);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vd->vdev_ops->vdev_op_rele(vd);
 	}
 }
 
 /*
  * Reopen all interior vdevs and any unopened leaves.  We don't actually
  * reopen leaf vdevs which had previously been opened as they might deadlock
  * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
  * If the leaf has never been opened then open it, as usual.
  *
  * The caller must hold all SCL_STATE_ALL config locks as writer (asserted).
  * The results of the open and of the validation are deliberately ignored;
  * the outcome is reflected in the vdev state, which is then propagated.
  */
 void
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	/* set the reopening flag unless we're taking the vdev offline */
 	vd->vdev_reopening = !vd->vdev_offline;
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Call vdev_validate() here to make sure we have the same device.
 	 * Otherwise, a device with an invalid label could be successfully
 	 * opened in response to vdev_reopen().
 	 */
 	if (vd->vdev_aux == NULL) {
 		(void) vdev_validate(vd);
 	} else {
 		(void) vdev_validate_aux(vd);
 
 		/*
 		 * A usable l2cache device that the L2ARC does not know
 		 * about yet is handed to it now.
 		 */
 		if (vdev_readable(vd) && vdev_writeable(vd) &&
 		    vd->vdev_aux == &spa->spa_l2cache &&
 		    !l2arc_vdev_present(vd)) {
 			l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/*
 	 * Reassess parent vdev's health.
 	 */
 	vdev_propagate_state(vd);
 }
 
 /*
  * Open a newly created vdev and initialize its on-disk state.
  *
  * Unlike a normal open, the vdev must come up fully healthy; otherwise it is
  * closed again and an error is returned.  On success the DTLs are loaded and
  * the labels are initialized for 'txg' (with VDEV_LABEL_REPLACE when
  * 'isreplacing' is set, VDEV_LABEL_CREATE otherwise).
  *
  * Returns 0 on success, or the error from vdev_open(), vdev_dtl_load() or
  * vdev_label_init().  ENXIO is returned when the open itself succeeded but
  * the vdev did not end up in VDEV_STATE_HEALTHY.  The vdev is closed on
  * every failure path.
  */
 int
 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
 {
 	int error;
 
 	/*
 	 * Normally, partial opens (e.g. of a mirror) are allowed.
 	 * For a create, however, we want to fail the request if
 	 * there are any components we can't open.
 	 */
 	error = vdev_open(vd);
 
 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
 		vdev_close(vd);
 		/*
 		 * The ENXIO is generated here rather than passed through
 		 * from a callee, so wrap it in SET_ERROR() like the other
 		 * locally generated errors in this file.
 		 */
 		return (error ? error : SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Recursively load DTLs and initialize all labels.
 	 */
 	if ((error = vdev_dtl_load(vd)) != 0 ||
 	    (error = vdev_label_init(vd, txg, isreplacing ?
 	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
 		vdev_close(vd);
 		return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Pick the metaslab shift for a vdev from its allocatable size
  * (vd->vdev_asize) and the zfs_vdev_*ms* tunables, and store the result in
  * vd->vdev_ms_shift.  The result is never smaller than SPA_MAXBLOCKSHIFT.
  */
 void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t shift;
 	uint64_t size = vd->vdev_asize;
 	/* number of metaslabs we would get at the default metaslab size */
 	uint64_t default_count = size >> zfs_vdev_default_ms_shift;
 
 	/* BEGIN CSTYLED */
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
 	 *
 	 * The default values used below are a good balance between memory
 	 * usage (larger metaslab size means more memory needed for loaded
 	 * metaslabs; more metaslabs means more memory needed for the
 	 * metaslab_t structs), metaslab load time (larger metaslabs take
 	 * longer to load), and metaslab sync time (more metaslabs means
 	 * more time spent syncing all of them).
 	 *
 	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
 	 * The range of the dimensions are as follows:
 	 *
 	 *	2^29 <= ms_size  <= 2^34
 	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
 	 * at least 512MB (2^29) to minimize fragmentation effects when
 	 * testing with smaller devices.  However, the count constraint
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
 	 * size of 16GB.  However, we will cap the total count to 2^17
 	 * metaslabs to keep our memory footprint in check and let the
 	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constraints is summarized below.
 	 *
 	 *   vdev size	    metaslab count
 	 *  --------------|-----------------
 	 *	< 8GB		~16
 	 *  8GB   - 100GB	one per 512MB
 	 *  100GB - 3TB		~200
 	 *  3TB   - 2PB		one per 16GB
 	 *	> 2PB		~131,072
 	 *  --------------------------------
 	 *
 	 *  Finally, note that all of the above calculate the initial
 	 *  number of metaslabs. Expanding a top-level vdev will result
 	 *  in additional metaslabs being allocated making it possible
 	 *  to exceed the zfs_vdev_ms_count_limit.
 	 */
 	/* END CSTYLED */
 
 	/* First pass: derive a shift from the desired metaslab count. */
 	if (default_count < zfs_vdev_min_ms_count) {
 		shift = highbit64(size / zfs_vdev_min_ms_count);
 	} else if (default_count > zfs_vdev_default_ms_count) {
 		shift = highbit64(size / zfs_vdev_default_ms_count);
 	} else {
 		shift = zfs_vdev_default_ms_shift;
 	}
 
 	/* Second pass: clamp the shift to the permitted range. */
 	if (shift < SPA_MAXBLOCKSHIFT) {
 		shift = SPA_MAXBLOCKSHIFT;
 	} else if (shift > zfs_vdev_max_ms_shift) {
 		shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
 		if ((size >> shift) > zfs_vdev_ms_count_limit) {
 			shift = highbit64(size / zfs_vdev_ms_count_limit);
 		}
 	}
 
 	vd->vdev_ms_shift = shift;
 	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
 }
 
 /*
  * Mark a top-level vdev dirty in the given txg.
  *
  * 'flags' is asserted to be zero or a single VDD_* bit.  With VDD_METASLAB,
  * 'arg' is added to the vdev's metaslab txg list; with VDD_DTL, 'arg' is
  * added to its DTL txg list.  The vdev itself is always added to the pool's
  * list of dirty vdevs for that txg.
  */
 void
 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
 {
 	ASSERT(vd == vd->vdev_top);
 	/* indirect vdevs don't have metaslabs or dtls */
 	ASSERT(vdev_is_concrete(vd) || flags == 0);
 	ASSERT(ISP2(flags));
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	if ((flags & VDD_METASLAB) != 0) {
 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
 	}
 
 	if ((flags & VDD_DTL) != 0) {
 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
 	}
 
 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
 }
 
 /*
  * Walk the tree rooted at vd, children first, and for every leaf call
  * vdev_dirty() on the leaf's top-level vdev with the leaf as argument.
  */
 void
 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_dirty_leaves(vd->vdev_child[i], flags, txg);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_dirty(vd->vdev_top, flags, vd, txg);
 	}
 }
 
 /*
  * DTLs.
  *
  * A vdev's DTL (dirty time log) is the set of transaction groups for which
  * the vdev has less than perfect replication.  There are four kinds of DTL:
  *
  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
  *
  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
  *
  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
  *	txgs that was scrubbed.
  *
  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
  *	persistent errors or just some device being offline.
  *	Unlike the other three, the DTL_OUTAGE map is not generally
  *	maintained; it's only computed when needed, typically to
  *	determine whether a device can be detached.
  *
  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
  * either has the data or it doesn't.
  *
  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
  * if any child is less than fully replicated, then so is its parent.
  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
  * comprising only those txgs which appear in more than 'maxfaults' children;
  * those are the txgs we don't have enough replication to read.  For example,
  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
  * two child DTL_MISSING maps.
  *
  * It should be clear from the above that to compute the DTLs and outage maps
  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
  * Therefore, that is all we keep on disk.  When loading the pool, or after
  * a configuration change, we generate all other DTLs from first principles.
  */
 /*
  * Add the range (txg, size) to the vdev's DTL of type 't', unless the DTL
  * already contains it.  Must not be called on the root vdev, and the pool
  * must be writeable (both asserted).
  */
 void
 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *dtl;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 	ASSERT(spa_writeable(vd->vdev_spa));
 
 	dtl = vd->vdev_dtl[t];
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (!range_tree_contains(dtl, txg, size)) {
 		range_tree_add(dtl, txg, size);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' contains the range (txg, size).
  * Always answers B_FALSE while the pool is being loaded.  Must not be
  * called on the root vdev (asserted).
  */
 boolean_t
 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
 {
 	range_tree_t *dtl;
 	boolean_t found;
 
 	ASSERT(t < DTL_TYPES);
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	dtl = vd->vdev_dtl[t];
 
 	/*
 	 * While we are loading the pool, the DTLs have not been loaded yet.
 	 * Ignore the DTLs and try all devices.  This avoids a recursive
 	 * mutex enter on the vdev_dtl_lock, and also makes us try hard
 	 * when loading the pool (relying on the checksum to ensure that
 	 * we get the right data -- note that while loading, we are
 	 * only reading the MOS, which is always checksummed).
 	 */
 	if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
 		return (B_FALSE);
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	if (range_tree_is_empty(dtl))
 		found = B_FALSE;
 	else
 		found = range_tree_contains(dtl, txg, size);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (found);
 }
 
 /*
  * Report whether the vdev's DTL of type 't' is empty.  The check is made
  * with vdev_dtl_lock held.
  */
 boolean_t
 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
 {
 	boolean_t is_empty;
 
 	mutex_enter(&vd->vdev_dtl_lock);
 	is_empty = range_tree_is_empty(vd->vdev_dtl[t]);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (is_empty);
 }
 
 /*
  * Returns B_TRUE if vdev determines offset needs to be resilvered.
  *
  * The decision is delegated to the vdev type's vdev_op_need_resilver
  * operation when the vdev is an interior vdev that provides one; leaf
  * vdevs and types without that operation always answer B_TRUE.
  */
 boolean_t
 vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
 {
 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
 
 	if (vd->vdev_ops->vdev_op_need_resilver != NULL &&
 	    !vd->vdev_ops->vdev_op_leaf) {
 		return (vd->vdev_ops->vdev_op_need_resilver(vd, offset,
 		    psize));
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Returns one less than the lowest txg in the DTL, i.e. the start of the
  * first DTL_MISSING segment minus one.  The caller must hold vdev_dtl_lock,
  * the DTL_MISSING map must be non-empty and the vdev must have no children
  * (all asserted).
  */
 static uint64_t
 vdev_dtl_min(vdev_t *vd)
 {
 	range_seg_t *first;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	first = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (first->rs_start - 1);
 }
 
 /*
  * Returns the upper bound of the DTL, i.e. the end (rs_end) of the last
  * DTL_MISSING segment.  The caller must hold vdev_dtl_lock, the DTL_MISSING
  * map must be non-empty and the vdev must have no children (all asserted).
  */
 static uint64_t
 vdev_dtl_max(vdev_t *vd)
 {
 	range_seg_t *last;
 
 	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
 	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
 	ASSERT0(vd->vdev_children);
 
 	last = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
 
 	return (last->rs_end);
 }
 
 /*
  * Determine if a resilvering vdev should remove any DTL entries from
  * its range. If the vdev was resilvering for the entire duration of the
  * scan then it should excise that range from its DTLs. Otherwise, this
  * vdev is considered partially resilvered and should leave its DTL
  * entries intact. The comment in vdev_dtl_reassess() describes how we
  * excise the DTLs.
  *
  * Only meaningful for leaf vdevs after an error-free scan (both asserted).
  * vdev_dtl_max() is consulted here, so vdev_dtl_lock must be held.
  */
 static boolean_t
 vdev_dtl_should_excise(vdev_t *vd)
 {
 	dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
 
 	ASSERT0(scn->scn_phys.scn_errors);
 	ASSERT0(vd->vdev_children);
 
 	/* A vdev in a state below DEGRADED never excises. */
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_FALSE);
 
 	/* Not resilvering, or nothing missing: excising is always fine. */
 	if (vd->vdev_resilver_txg == 0)
 		return (B_TRUE);
 	if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
 
 	/*
 	 * When a resilver is initiated the scan will assign the scn_max_txg
 	 * value to the highest txg value that exists in all DTLs. If this
 	 * device's max DTL is not part of this scan (i.e. it is not in
 	 * the range (scn_min_txg, scn_max_txg] then it is not eligible
 	 * for excision.
 	 */
 	if (vdev_dtl_max(vd) > scn->scn_phys.scn_max_txg)
 		return (B_FALSE);
 
 	ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
 	ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
 	ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
 	return (B_TRUE);
 }
 
 /*
  * Reassess DTLs after a config change or scrub completion.
  *
  * The tree rooted at vd is processed children first.  The root vdev,
  * non-concrete vdevs and aux vdevs are skipped after their children have
  * been visited.
  *
  * For a leaf vdev, DTL_PARTIAL and DTL_OUTAGE are regenerated from
  * DTL_MISSING, and DTL_MISSING itself may first have the scrubbed range
  * excised (see below).  For an interior vdev, every DTL except DTL_SCRUB
  * is regenerated from the children's DTLs.
  *
  *	txg		if non-zero, each leaf's top-level vdev is dirtied
  *			with VDD_DTL in this txg
  *	scrub_txg	if non-zero, a scan completed up to this txg and the
  *			range [0, scrub_txg) is a candidate for excision
  *	scrub_done	if non-zero, each leaf's DTL_SCRUB map is emptied
  *
  * The caller must hold at least one of the SCL_ALL config locks as reader
  * (asserted).
  */
 void
 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 {
 	spa_t *spa = vd->vdev_spa;
 	avl_tree_t reftree;
 	int minref;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	/* Children first, so their DTLs are current when we combine them. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done);
 
 	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
 
 		mutex_enter(&vd->vdev_dtl_lock);
 
 		/*
 		 * If we've completed a scan cleanly then determine
 		 * if this vdev should remove any DTLs. We only want to
 		 * excise regions on vdevs that were available during
 		 * the entire duration of this scan.
 		 */
 		if (scrub_txg != 0 &&
 		    (spa->spa_scrub_started ||
 		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
 		    vdev_dtl_should_excise(vd)) {
 			/*
 			 * We completed a scrub up to scrub_txg.  If we
 			 * did it without rebooting, then the scrub dtl
 			 * will be valid, so excise the old region and
 			 * fold in the scrub dtl.  Otherwise, leave the
 			 * dtl as-is if there was an error.
 			 *
 			 * There's a little trick here: to excise the beginning
 			 * of the DTL_MISSING map, we put it into a reference
 			 * tree and then add a segment with refcnt -1 that
 			 * covers the range [0, scrub_txg).  This means
 			 * that each txg in that range has refcnt -1 or 0.
 			 * We then add DTL_SCRUB with a refcnt of 2, so that
 			 * entries in the range [0, scrub_txg) will have a
 			 * positive refcnt -- either 1 or 2.  We then convert
 			 * the reference tree into the new DTL_MISSING map.
 			 */
 			space_reftree_create(&reftree);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
 			space_reftree_add_map(&reftree,
 			    vd->vdev_dtl[DTL_SCRUB], 2);
 			space_reftree_generate_map(&reftree,
 			    vd->vdev_dtl[DTL_MISSING], 1);
 			space_reftree_destroy(&reftree);
 		}
 		/* Rebuild DTL_PARTIAL as a copy of DTL_MISSING. */
 		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
 		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
 		if (scrub_done)
 			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
 		/*
 		 * Rebuild DTL_OUTAGE: the whole txg range if the vdev cannot
 		 * be read at all, otherwise a copy of DTL_MISSING.
 		 */
 		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
 		if (!vdev_readable(vd))
 			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
 		else
 			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
 			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 
 		/*
 		 * If the vdev was resilvering and no longer has any
 		 * DTLs then reset its resilvering flag.
 		 */
 		if (vd->vdev_resilver_txg != 0 &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE]))
 			vd->vdev_resilver_txg = 0;
 
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
 	}
 
 	/*
 	 * Interior vdev: build each DTL from the children's DTLs.  A txg
 	 * ends up in our map when it appears in at least 'minref' children.
 	 * Our own vdev_dtl_lock is held for the whole rebuild; each child's
 	 * lock is held only while its map is added to the reference tree.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	for (int t = 0; t < DTL_TYPES; t++) {
 		/* account for child's outage in parent's missing map */
 		int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
 		if (t == DTL_SCRUB)
 			continue;			/* leaf vdevs only */
 		if (t == DTL_PARTIAL)
 			minref = 1;			/* i.e. non-zero */
 		else if (vd->vdev_nparity != 0)
 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
 		else
 			minref = vd->vdev_children;	/* any kind of mirror */
 		space_reftree_create(&reftree);
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			mutex_enter(&cvd->vdev_dtl_lock);
 			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
 			mutex_exit(&cvd->vdev_dtl_lock);
 		}
 		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
 		space_reftree_destroy(&reftree);
 	}
 	mutex_exit(&vd->vdev_dtl_lock);
 }
 
 /*
  * Load the on-disk DTLs for the tree rooted at vd.
  *
  * A leaf vdev with a DTL object opens its DTL space map and loads it into
  * the DTL_MISSING map.  Any other vdev recurses into its children, stopping
  * at the first child that fails.
  *
  * Returns 0 on success or the first error encountered.
  */
 int
 vdev_dtl_load(vdev_t *vd)
 {
 	int err;
 
 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_dtl_object == 0) {
 		for (int i = 0; i < vd->vdev_children; i++) {
 			err = vdev_dtl_load(vd->vdev_child[i]);
 			if (err != 0)
 				return (err);
 		}
 		return (0);
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	err = space_map_open(&vd->vdev_dtl_sm, vd->vdev_spa->spa_meta_objset,
 	    vd->vdev_dtl_object, 0, -1ULL, 0);
 	if (err)
 		return (err);
 	ASSERT(vd->vdev_dtl_sm != NULL);
 
 	/* The in-core DTL may only be modified with vdev_dtl_lock held. */
 	mutex_enter(&vd->vdev_dtl_lock);
 	err = space_map_load(vd->vdev_dtl_sm,
 	    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	return (err);
 }
 
 /*
  * Record the allocation bias of a top-level vdev in its top-level ZAP, as a
  * string under VDEV_TOP_ZAP_ALLOCATION_BIAS.  The vdev must have a bias
  * other than VDEV_BIAS_NONE (asserted).  For the special and dedup biases,
  * spa_activate_allocation_classes() is called as well.
  */
 static void
 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	vdev_alloc_bias_t bias = vd->vdev_alloc_bias;
 	const char *bias_name;
 
 	ASSERT(bias != VDEV_BIAS_NONE);
 
 	switch (bias) {
 	case VDEV_BIAS_LOG:
 		bias_name = VDEV_ALLOC_BIAS_LOG;
 		break;
 	case VDEV_BIAS_SPECIAL:
 		bias_name = VDEV_ALLOC_BIAS_SPECIAL;
 		break;
 	case VDEV_BIAS_DEDUP:
 		bias_name = VDEV_ALLOC_BIAS_DEDUP;
 		break;
 	default:
 		bias_name = NULL;
 		break;
 	}
 
 	ASSERT(bias_name != NULL);
 	/* The value is stored including its terminating NUL. */
 	VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
 	    1, strlen(bias_name) + 1, bias_name, tx));
 
 	if (bias == VDEV_BIAS_SPECIAL || bias == VDEV_BIAS_DEDUP)
 		spa_activate_allocation_classes(spa, tx);
 }
 
 /*
  * Destroy the ZAP object 'zapobj' in the MOS and drop its entry from the
  * pool's spa_all_vdev_zaps object.  Both operations are verified to
  * succeed.
  */
 void
 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
 {
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 
 	VERIFY0(zap_destroy(mos, zapobj, tx));
 	VERIFY0(zap_remove_int(mos, vd->vdev_spa->spa_all_vdev_zaps,
 	    zapobj, tx));
 }
 
 /*
  * Create a new ZAP object in the MOS, register it in the pool's
  * spa_all_vdev_zaps object, and return the new object number.
  */
 uint64_t
 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
 	uint64_t zapobj;
 
 	zapobj = zap_create(mos, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 	ASSERT(zapobj != 0);
 
 	VERIFY0(zap_add_int(mos, vd->vdev_spa->spa_all_vdev_zaps,
 	    zapobj, tx));
 
 	return (zapobj);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd' and create any per-vdev ZAPs that do
  * not exist yet: a leaf ZAP for leaf vdevs and a top-level ZAP for a vdev
  * that is its own vdev_top.  A freshly created top-level ZAP also gets the
  * allocation bias recorded when one is set.
  *
  * Hole, missing and root vdevs, and vdevs whose top-level is being
  * removed, get no ZAPs themselves, but their children are still visited.
  */
 void
 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	/*
 	 * The ops checks come first so that vdev_top is only dereferenced
 	 * once they have all passed.
 	 */
 	boolean_t eligible =
 	    (vd->vdev_ops != &vdev_hole_ops &&
 	    vd->vdev_ops != &vdev_missing_ops &&
 	    vd->vdev_ops != &vdev_root_ops &&
 	    !vd->vdev_top->vdev_removing);
 
 	if (eligible && vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0)
 		vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
 
 	if (eligible && vd == vd->vdev_top && vd->vdev_top_zap == 0) {
 		vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
 		if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
 			vdev_zap_allocation_data(vd, tx);
 	}
 
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		vdev_construct_zaps(vd->vdev_child[c], tx);
 }
 
 /*
  * Sync the DTL of leaf vdev 'vd' to the MOS in the given txg.
  *
  * If the leaf has been detached, or its top-level vdev is being removed,
  * the DTL space map is freed and closed instead (and the leaf ZAP is
  * destroyed for detached leaves and for log devices).  Otherwise the whole
  * DTL_MISSING range tree is rewritten into the DTL space map, allocating
  * and opening a space map object first if the leaf has none open.  When the
  * space map object number changes, the top-level vdev's config is dirtied.
  */
 void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
 	objset_t *mos = spa->spa_meta_objset;
 	range_tree_t *rtsync;
 	dmu_tx_t *tx;
 	/* Object number on entry; compared against the final one below. */
 	uint64_t object = space_map_object(vd->vdev_dtl_sm);
 
 	ASSERT(vdev_is_concrete(vd));
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
 	if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
 		/* The DTL is no longer needed: release its space map. */
 		mutex_enter(&vd->vdev_dtl_lock);
 		space_map_free(vd->vdev_dtl_sm, tx);
 		space_map_close(vd->vdev_dtl_sm);
 		vd->vdev_dtl_sm = NULL;
 		mutex_exit(&vd->vdev_dtl_lock);
 
 		/*
 		 * We only destroy the leaf ZAP for detached leaves or for
 		 * removed log devices. Removed data devices handle leaf ZAP
 		 * cleanup later, once cancellation is no longer possible.
 		 */
 		if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
 		    vd->vdev_top->vdev_islog)) {
 			vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
 			vd->vdev_leaf_zap = 0;
 		}
 
 		dmu_tx_commit(tx);
 		return;
 	}
 
 	/* No space map open yet: allocate a new object and open it. */
 	if (vd->vdev_dtl_sm == NULL) {
 		uint64_t new_object;
 
 		new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx);
 		VERIFY3U(new_object, !=, 0);
 
 		VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
 		    0, -1ULL, 0));
 		ASSERT(vd->vdev_dtl_sm != NULL);
 	}
 
 	rtsync = range_tree_create(NULL, NULL);
 
 	/*
 	 * Copy the in-core DTL into a private tree while holding the DTL
 	 * lock; the space map is then written from the copy after the lock
 	 * has been dropped.
 	 */
 	mutex_enter(&vd->vdev_dtl_lock);
 	range_tree_walk(rt, range_tree_add, rtsync);
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	/* Replace the previous on-disk contents with the copy. */
 	space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
 	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
 
 	/*
 	 * If the object for the space map has changed then dirty
 	 * the top level so that we update the config.
 	 */
 	if (object != space_map_object(vd->vdev_dtl_sm)) {
 		vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
 		    "new object %llu", (u_longlong_t)txg, spa_name(spa),
 		    (u_longlong_t)object,
 		    (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
 		vdev_config_dirty(vd->vdev_top);
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Determine whether the specified vdev can be offlined/detached/removed
  * without losing data.
  *
  * Returns B_TRUE when the vdev is still required (taking it away would
  * leave an outage in its top-level vdev) and B_FALSE when it can go.  The
  * root vdev and top-level vdevs are always reported as required.  The
  * caller must hold SCL_STATE_ALL as writer.
  */
 boolean_t
 vdev_dtl_required(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *top = vd->vdev_top;
 	uint8_t saved_cant_read = vd->vdev_cant_read;
 	boolean_t needed;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == top || vd == spa->spa_root_vdev)
 		return (B_TRUE);
 
 	/*
 	 * Pretend the device cannot be read and recompute the top-level
 	 * DTLs.  A non-empty DTL_OUTAGE on the top-level then means this
 	 * device is needed.  Afterwards restore the real flag and recompute
 	 * once more so the DTLs describe the actual state again.
 	 */
 	vd->vdev_cant_read = B_TRUE;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 	needed = !vdev_dtl_empty(top, DTL_OUTAGE);
 	vd->vdev_cant_read = saved_cant_read;
 	vdev_dtl_reassess(top, 0, 0, B_FALSE);
 
 	/* Fault injection may force the device to be treated as required. */
 	if (zio_injection_enabled && !needed)
 		needed = !!zio_handle_device_injection(vd, NULL, ECHILD);
 
 	return (needed);
 }
 
 /*
  * Determine if resilver is needed, and if so the txg range.
  *
  * A leaf needs resilvering when its DTL_MISSING tree is non-empty and the
  * leaf is writeable; an interior vdev needs it when any child does, in
  * which case the reported range is the union of the children's ranges.
  *
  * 'minp' and 'maxp' are optional and independent out-parameters: each one
  * is written only when it is non-NULL and a resilver is needed.  Neither
  * is touched when B_FALSE is returned.
  */
 boolean_t
 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 {
 	boolean_t needed = B_FALSE;
 	uint64_t thismin = UINT64_MAX;
 	uint64_t thismax = 0;
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
 		    vdev_writeable(vd)) {
 
 			thismin = vdev_dtl_min(vd);
 			thismax = vdev_dtl_max(vd);
 			needed = B_TRUE;
 		}
 		mutex_exit(&vd->vdev_dtl_lock);
 	} else {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 			uint64_t cmin, cmax;
 
 			/* cmin/cmax are only valid when the child says yes. */
 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
 				thismin = MIN(thismin, cmin);
 				thismax = MAX(thismax, cmax);
 				needed = B_TRUE;
 			}
 		}
 	}
 
 	if (needed) {
 		/*
 		 * Check each pointer on its own: previously *maxp was
 		 * written whenever minp was non-NULL, which dereferenced a
 		 * NULL maxp, and a caller passing only maxp got no result.
 		 */
 		if (minp != NULL)
 			*minp = thismin;
 		if (maxp != NULL)
 			*maxp = thismax;
 	}
 	return (needed);
 }
 
 /*
  * Gets the checkpoint space map object from the vdev's ZAP.
  * Returns the spacemap object, or 0 if it wasn't in the ZAP
  * or the ZAP doesn't exist yet.
  *
  * The caller must not hold any SCL_ALL config lock as writer.
  *
  * NOTE(review): the object number is looked up as a uint64_t but the
  * function returns int, so the value is narrowed on return -- confirm
  * that object numbers seen here always fit.
  */
 int
 vdev_checkpoint_sm_object(vdev_t *vd)
 {
 	uint64_t obj = 0;
 
 	ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
 
 	if (vd->vdev_top_zap != 0) {
 		int err = zap_lookup(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
 		    sizeof (uint64_t), 1, &obj);
 
 		/* A missing entry is fine; obj simply stays 0. */
 		ASSERT(err == 0 || err == ENOENT);
 	}
 
 	return (obj);
 }
 
 /*
  * Load the in-core state of 'vd' and all of its descendants.
  *
  * Children are loaded first.  Then, for this vdev:
  *  - the deflate ratio is set;
  *  - a top-level vdev with a top-level ZAP reads its allocation bias;
  *  - a concrete top-level vdev creates its metaslab group, validates
  *    ashift/asize, initializes its metaslabs and opens the checkpoint
  *    space map when one is recorded;
  *  - a leaf vdev loads its DTL;
  *  - any vdev with an obsolete space map object opens it.
  *
  * Returns 0 on success or an error code.  Most failure paths also move the
  * vdev to VDEV_STATE_CANT_OPEN with VDEV_AUX_CORRUPT_DATA; a failure to
  * open the checkpoint space map, or an error from a child, is returned
  * without changing this vdev's state.
  */
 int
 vdev_load(vdev_t *vd)
 {
 	int error = 0;
 	/*
 	 * Recursively load all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		error = vdev_load(vd->vdev_child[c]);
 		if (error != 0) {
 			return (error);
 		}
 	}
 
 	vdev_set_deflate_ratio(vd);
 
 	/*
 	 * On spa_load path, grab the allocation bias from our zap
 	 */
 	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
 		spa_t *spa = vd->vdev_spa;
 		char bias_str[64];
 
 		/* A failed lookup leaves vdev_alloc_bias unchanged. */
 		if (zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
 		    bias_str) == 0) {
 			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
 			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
 		}
 	}
 
 	/*
 	 * If this is a top-level vdev, initialize its metaslabs.
 	 */
 	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
 		vdev_metaslab_group_create(vd);
 
 		/* A zero ashift or asize is treated as corrupt data. */
 		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
 			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
 			    (u_longlong_t)vd->vdev_asize);
 			return (SET_ERROR(ENXIO));
 		}
 
 		error = vdev_metaslab_init(vd, 0);
 		if (error != 0) {
 			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
 			    "[error=%d]", error);
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			return (error);
 		}
 
 		/* Open the checkpoint space map if the ZAP records one. */
 		uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd);
 		if (checkpoint_sm_obj != 0) {
 			objset_t *mos = spa_meta_objset(vd->vdev_spa);
 			ASSERT(vd->vdev_asize != 0);
 			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
 
 			error = space_map_open(&vd->vdev_checkpoint_sm,
 			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
 			    vd->vdev_ashift);
 			if (error != 0) {
 				/* Note: the vdev state is not changed here. */
 				vdev_dbgmsg(vd, "vdev_load: space_map_open "
 				    "failed for checkpoint spacemap (obj %llu) "
 				    "[error=%d]",
 				    (u_longlong_t)checkpoint_sm_obj, error);
 				return (error);
 			}
 			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
 
 			/*
 			 * Since the checkpoint_sm contains free entries
 			 * exclusively we can use space_map_allocated() to
 			 * indicate the cumulative checkpointed space that
 			 * has been freed.
 			 */
 			vd->vdev_stat.vs_checkpoint_space =
 			    -space_map_allocated(vd->vdev_checkpoint_sm);
 			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
 			    vd->vdev_stat.vs_checkpoint_space;
 		}
 	}
 
 	/*
 	 * If this is a leaf vdev, load its DTL.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
 		    "[error=%d]", error);
 		return (error);
 	}
 
 	/* Open the obsolete space map if this vdev has one. */
 	uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd);
 	if (obsolete_sm_object != 0) {
 		objset_t *mos = vd->vdev_spa->spa_meta_objset;
 		ASSERT(vd->vdev_asize != 0);
 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
 
 		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
 		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 			    VDEV_AUX_CORRUPT_DATA);
 			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
 			    "obsolete spacemap (obj %llu) [error=%d]",
 			    (u_longlong_t)obsolete_sm_object, error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * The special vdev case is used for hot spares and l2cache devices.  Its
  * sole purpose is to set the vdev state for the associated vdev.  To do
  * this, we make sure that we can open the underlying device, then try to
  * read the label, and make sure that the label is sane and that it hasn't
  * been repurposed to another pool.
  *
  * Returns 0 when the vdev is not readable (nothing is checked) or when the
  * label passes validation.  Returns -1, after marking the vdev
  * VDEV_STATE_CANT_OPEN / VDEV_AUX_CORRUPT_DATA, when the label cannot be
  * read or fails validation.
  */
 int
 vdev_validate_aux(vdev_t *vd)
 {
 	nvlist_t *config;
 	uint64_t label_guid, label_version, label_state;
 	boolean_t sane;
 
 	if (!vdev_readable(vd))
 		return (0);
 
 	config = vdev_label_read_config(vd, -1ULL);
 	if (config == NULL) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		return (-1);
 	}
 
 	/*
 	 * The label must carry a supported version, this vdev's guid and a
 	 * pool state.  Evaluation stops at the first check that fails.
 	 */
 	sane = (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &label_version) == 0 &&
 	    SPA_VERSION_IS_SUPPORTED(label_version) &&
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
 	    &label_guid) == 0 &&
 	    label_guid == vd->vdev_guid &&
 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &label_state) == 0);
 
 	if (!sane) {
 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_CORRUPT_DATA);
 		nvlist_free(config);
 		return (-1);
 	}
 
 	/*
 	 * We don't actually check the pool state here.  If it's in fact in
 	 * use by another pool, we update this fact on the fly when requested.
 	 */
 	nvlist_free(config);
 	return (0);
 }
 
 /*
  * Free the objects used to store this vdev's spacemaps, and the array
  * that points to them.  Does nothing when the vdev has no metaslab array
  * object.  On return vd->vdev_ms_array is 0.
  */
 void
 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
 {
 	objset_t *mos;
 	uint64_t nslots;
 	uint64_t *objs;
 	size_t nbytes;
 
 	if (vd->vdev_ms_array == 0)
 		return;
 
 	mos = vd->vdev_spa->spa_meta_objset;
 	nslots = vd->vdev_asize >> vd->vdev_ms_shift;
 	nbytes = nslots * sizeof (uint64_t);
 	objs = kmem_alloc(nbytes, KM_SLEEP);
 
 	/* Read the whole array of space map object numbers in one go. */
 	VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, nbytes, objs, 0));
 
 	/* Slots holding 0 have no space map object to free. */
 	for (uint64_t m = 0; m < nslots; m++) {
 		if (objs[m] != 0)
 			space_map_free_obj(mos, objs[m], tx);
 	}
 
 	kmem_free(objs, nbytes);
 	VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
 	vd->vdev_ms_array = 0;
 }
 
 /*
  * Destroy the on-disk metadata of log top-level vdev 'vd' -- its metaslab
  * space maps and, if present, its top-level ZAP -- in a transaction
  * assigned to 'txg', which must be the currently syncing txg.
  */
 static void
 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	dmu_tx_t *tx;
 
 	ASSERT(vd->vdev_islog);
 	ASSERT(vd == vd->vdev_top);
 	ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 
 	vdev_destroy_spacemaps(vd, tx);
 
 	if (vd->vdev_top_zap != 0) {
 		uint64_t top_zap = vd->vdev_top_zap;
 
 		vdev_destroy_unlink_zap(vd, top_zap, tx);
 		vd->vdev_top_zap = 0;
 	}
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Finish syncing 'txg' for vdev 'vd': run metaslab_sync_done() on every
  * metaslab queued on the TXG_CLEAN(txg) list and, if that list was not
  * empty to begin with, reassess the vdev's metaslab group.
  */
 void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
 	uint64_t clean_txg = TXG_CLEAN(txg);
 	/* Sampled before the list is drained below. */
 	boolean_t had_work = !txg_list_empty(&vd->vdev_ms_list, clean_txg);
 	metaslab_t *ms;
 
 	ASSERT(vdev_is_concrete(vd));
 
 	for (;;) {
 		ms = txg_list_remove(&vd->vdev_ms_list, clean_txg);
 		if (ms == NULL)
 			break;
 		metaslab_sync_done(ms, txg);
 	}
 
 	if (had_work)
 		metaslab_sync_reassess(vd->vdev_mg);
 }
 
 /*
  * Sync the dirty state of vdev 'vd' for 'txg', which must be the currently
  * syncing txg.
  *
  * In order: obsolete segments are synced (an indirect vdev is finished
  * after that step); the metaslab array object is allocated if the vdev
  * needs one; every dirty metaslab is synced and requeued on the
  * TXG_CLEAN(txg) list; every dirty leaf DTL is synced; the metadata of an
  * empty log device that is being removed is destroyed; and finally the
  * vdev is queued on the pool's spa_vdev_txg_list for TXG_CLEAN(txg).
  */
 void
 vdev_sync(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *lvd;
 	metaslab_t *msp;
 
 	ASSERT3U(txg, ==, spa->spa_syncing_txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 
 		vdev_indirect_sync_obsolete(vd, tx);
 
 		/*
 		 * If the vdev is indirect, it can't have dirty
 		 * metaslabs or DTLs.
 		 */
 		if (vd->vdev_ops == &vdev_indirect_ops) {
 			ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
 			ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
 			dmu_tx_commit(tx);
 			return;
 		}
 	}
 
 	ASSERT(vdev_is_concrete(vd));
 
 	/*
 	 * Allocate the metaslab array object the first time a vdev that
 	 * has a metaslab shift, and is not being removed, is synced.
 	 */
 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
 	    !vd->vdev_removing) {
 		ASSERT(vd == vd->vdev_top);
 		ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
 		ASSERT(vd->vdev_ms_array != 0);
 		vdev_config_dirty(vd);
 	}
 
 	/* Sync each dirty metaslab and requeue it for vdev_sync_done(). */
 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
 		metaslab_sync(msp, txg);
 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
 	}
 
 	/* Sync the DTL of every leaf that was dirtied in this txg. */
 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
 		vdev_dtl_sync(lvd, txg);
 
 	/*
 	 * If this is an empty log device being removed, destroy the
 	 * metadata associated with it.
 	 */
 	if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
 		vdev_remove_empty_log(vd, txg);
 
 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
 	dmu_tx_commit(tx);
 }
 
 /*
  * Convert a physical size to an allocated size for 'vd' by delegating to
  * the vdev type's vdev_op_asize callback.
  */
 uint64_t
 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
 {
 	uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize);
 
 	return (asize);
 }
 
 /*
  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
  * not be opened, and no I/O is attempted.
  *
  * 'guid' identifies the vdev within 'spa' and 'aux' records why it is being
  * faulted.  Returns the result of spa_vdev_state_exit(): ENODEV is passed
  * when no vdev has that guid, ENOTSUP when the vdev is not a leaf, and 0
  * otherwise.  If the vdev turns out to hold the only valid copy of some
  * data it is marked degraded instead of faulted.
  */
 int
 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *vd, *tvd;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 
 	/*
 	 * We don't directly use the aux state here, but if we do a
 	 * vdev_reopen(), we need this value to be present to remember why we
 	 * were faulted.
 	 */
 	vd->vdev_label_aux = aux;
 
 	/*
 	 * Faulted state takes precedence over degraded.
 	 */
 	vd->vdev_delayed_close = B_FALSE;
 	vd->vdev_faulted = 1ULL;
 	vd->vdev_degraded = 0ULL;
 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
 
 	/*
 	 * If this device has the only valid copy of the data, then
 	 * back off and simply mark the vdev as degraded instead.
 	 * Log devices and auxiliary vdevs (vdev_aux != NULL) are exempt
 	 * from this check.
 	 */
 	if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
 		vd->vdev_degraded = 1ULL;
 		vd->vdev_faulted = 0ULL;
 
 		/*
 		 * If we reopen the device and it's not dead, only then do we
 		 * mark it degraded.
 		 */
 		vdev_reopen(tvd);
 
 		if (vdev_readable(vd))
 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Mark the given vdev degraded.  A degraded vdev is purely an indication to
  * the user that something is wrong.  The vdev continues to operate as normal
  * as far as I/O is concerned.
  *
  * Returns the result of spa_vdev_state_exit(): ENODEV is passed when no
  * vdev has the given guid, ENOTSUP when the vdev is not a leaf, and 0
  * otherwise (including when the vdev was already faulted or degraded and
  * nothing was changed).
  */
 int
 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
 {
 	vdev_t *target;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	target = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (target == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!target->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/*
 	 * A vdev that is already faulted or degraded is left untouched.
 	 */
 	if (target->vdev_faulted || target->vdev_degraded)
 		return (spa_vdev_state_exit(spa, NULL, 0));
 
 	target->vdev_degraded = 1ULL;
 
 	/* Only a live device has its visible state changed. */
 	if (!vdev_is_dead(target)) {
 		vdev_set_state(target, B_FALSE, VDEV_STATE_DEGRADED, aux);
 	}
 
 	return (spa_vdev_state_exit(spa, target, 0));
 }
 
 /*
  * Online the given vdev.
  *
  * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things.  First, any attached
  * spare device should be detached when the device finishes resilvering.
  * Second, the online should be treated like a 'test' online case, so no FMA
  * events are generated if the device fails to open.
  *
  * 'flags' is a mask of ZFS_ONLINE_* bits.  When 'newstate' is non-NULL it
  * receives the vdev's state after the reopen.  Returns the result of
  * spa_vdev_state_exit(): ENODEV is passed when no vdev has the given guid,
  * ENOTSUP when the vdev is not a leaf or when expansion applies to an
  * auxiliary vdev, and 0 otherwise.
  */
 int
 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
 {
 	vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
 	boolean_t wasoffline;
 	vdev_state_t oldstate;
 
 	spa_vdev_state_enter(spa, SCL_NONE);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/* Remember the prior state to decide on the event at the end. */
 	wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
 	oldstate = vd->vdev_state;
 
 	tvd = vd->vdev_top;
 	vd->vdev_offline = B_FALSE;
 	vd->vdev_tmpoffline = B_FALSE;
 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
 
 	/* XXX - L2ARC 1.0 does not support expansion */
 	if (!vd->vdev_aux) {
 		/* Flag this vdev and each ancestor below the root. */
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND);
 	}
 
 	vdev_reopen(tvd);
 	/* These flags only apply to the reopen above; clear them again. */
 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
 
 	if (!vd->vdev_aux) {
 		for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 			pvd->vdev_expanding = B_FALSE;
 	}
 
 	if (newstate)
 		*newstate = vd->vdev_state;
 	/*
 	 * Request an unspare only for a live device that is the first
 	 * child of a spare vdev.
 	 */
 	if ((flags & ZFS_ONLINE_UNSPARE) &&
 	    !vdev_is_dead(vd) && vd->vdev_parent &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 
 	if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
 
 		/* XXX - L2ARC 1.0 does not support expansion */
 		if (vd->vdev_aux)
 			return (spa_vdev_state_exit(spa, vd, ENOTSUP));
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
 	/* Restart initializing if necessary */
 	mutex_enter(&vd->vdev_initialize_lock);
 	if (vdev_writeable(vd) &&
 	    vd->vdev_initialize_thread == NULL &&
 	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
 		(void) vdev_initialize(vd);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	/*
 	 * Post the online event if the device had been offline, or if its
 	 * state rose from below DEGRADED to DEGRADED or better.
 	 */
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the leaf vdev identified by 'guid' offline.  vdev_offline() calls
  * this with spa_vdev_top_lock held.
  *
  * 'flags' is a mask of ZFS_OFFLINE_* bits; ZFS_OFFLINE_TEMPORARY sets the
  * vdev's vdev_tmpoffline flag.  Returns the result of
  * spa_vdev_state_exit(): ENODEV is passed when no vdev has that guid,
  * ENOTSUP when the vdev is not a leaf, EBUSY when the device holds the
  * only valid copy of some data or offlining it would leave the top-level
  * vdev dead, an error from resetting the logs (or
  * ZFS_ERR_CHECKPOINT_EXISTS) for log devices, and 0 otherwise.
  *
  * For a log top-level vdev the state lock is dropped while the logs are
  * reset; if the pool configuration changed in the meantime the whole
  * operation restarts from 'top'.
  */
 static int
 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	vdev_t *vd, *tvd;
 	int error = 0;
 	uint64_t generation;
 	metaslab_group_t *mg;
 
 top:
 	spa_vdev_state_enter(spa, SCL_ALLOC);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	tvd = vd->vdev_top;
 	mg = tvd->vdev_mg;
 	/* Compared with spa_config_generation after the lock is retaken. */
 	generation = spa->spa_config_generation + 1;
 
 	/*
 	 * If the device isn't already offline, try to offline it.
 	 */
 	if (!vd->vdev_offline) {
 		/*
 		 * If this device has the only valid copy of some data,
 		 * don't allow it to be offlined. Log devices are always
 		 * expendable.
 		 */
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
 		 * If the top-level is a slog and it has had allocations
 		 * then proceed.  We check that the vdev's metaslab group
 		 * is not NULL since it's possible that we may have just
 		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
 			metaslab_group_passivate(mg);
 			(void) spa_vdev_state_exit(spa, vd, 0);
 
 			/* Runs with the vdev state lock dropped. */
 			error = spa_reset_logs(spa);
 
 			/*
 			 * If the log device was successfully reset but has
 			 * checkpointed data, do not offline it.
 			 */
 			if (error == 0 &&
 			    tvd->vdev_checkpoint_sm != NULL) {
 				error = ZFS_ERR_CHECKPOINT_EXISTS;
 			}
 
 			spa_vdev_state_enter(spa, SCL_ALLOC);
 
 			/*
 			 * Check to see if the config has changed.
 			 */
 			if (error || generation != spa->spa_config_generation) {
 				/* Undo the passivate before failing/retrying. */
 				metaslab_group_activate(mg);
 				if (error)
 					return (spa_vdev_state_exit(spa,
 					    vd, error));
 				(void) spa_vdev_state_exit(spa, vd, 0);
 				goto top;
 			}
 			ASSERT0(tvd->vdev_stat.vs_alloc);
 		}
 
 		/*
 		 * Offline this device and reopen its top-level vdev.
 		 * If the top-level vdev is a log device then just offline
 		 * it. Otherwise, if this action results in the top-level
 		 * vdev becoming unusable, undo it and fail the request.
 		 */
 		vd->vdev_offline = B_TRUE;
 		vdev_reopen(tvd);
 
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_is_dead(tvd)) {
 			vd->vdev_offline = B_FALSE;
 			vdev_reopen(tvd);
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 		}
 
 		/*
 		 * Add the device back into the metaslab rotor so that
 		 * once we online the device it's open for business.
 		 */
 		if (tvd->vdev_islog && mg != NULL)
 			metaslab_group_activate(mg);
 	}
 
 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
 
 	return (spa_vdev_state_exit(spa, vd, 0));
 }
 
 /*
  * Take the vdev identified by 'guid' offline.  This is a wrapper that
  * holds spa_vdev_top_lock around vdev_offline_locked() and returns its
  * result.
  */
 int
 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
 {
 	int rc;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	rc = vdev_offline_locked(spa, guid, flags);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
  * vdev_offline(), we assume the spa config is locked.  We also clear all
  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
  *
  * The caller must hold SCL_STATE_ALL as writer.
  */
 void
 vdev_clear(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (vd == NULL)
 		vd = rvd;
 
 	vd->vdev_stat.vs_read_errors = 0;
 	vd->vdev_stat.vs_write_errors = 0;
 	vd->vdev_stat.vs_checksum_errors = 0;
 
 	/* Children are cleared before the state handling for this vdev. */
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_clear(spa, vd->vdev_child[c]);
 
 	/*
 	 * It makes no sense to "clear" an indirect vdev.
 	 */
 	if (!vdev_is_concrete(vd))
 		return;
 
 	/*
 	 * If we're in the FAULTED state or have experienced failed I/O, then
 	 * clear the persistent state and attempt to reopen the device.  We
 	 * also mark the vdev config dirty, so that the new faulted state is
 	 * written out to disk.
 	 */
 	if (vd->vdev_faulted || vd->vdev_degraded ||
 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
 
 		/*
 		 * When reopening in response to a clear event, it may be due
 		 * to a fmadm repair request.  In this case, if the device is
 		 * still broken, we want to still post the ereport again.
 		 */
 		vd->vdev_forcefault = B_TRUE;
 
 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
 		vd->vdev_cant_read = B_FALSE;
 		vd->vdev_cant_write = B_FALSE;
 
 		/* The root is reopened directly; others via their top-level. */
 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
 
 		vd->vdev_forcefault = B_FALSE;
 
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
 		/* A live, non-auxiliary device may need to be resilvered. */
 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
 
 	/*
 	 * When clearing a FMA-diagnosed fault, we always want to
 	 * unspare the device, as we assume that the original spare was
 	 * done in response to the FMA fault.
 	 */
 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_parent->vdev_child[0] == vd)
 		vd->vdev_unspare = B_TRUE;
 }
 
 /*
  * Report whether 'vd' should be treated as dead: its state is below
  * VDEV_STATE_DEGRADED, or it is a hole or missing vdev.
  *
  * Holes and missing devices are always considered "dead".  This keeps
  * callers simple, since they need no special handling for those types and
  * can instead rely on dead devices being skipped before I/O is issued.
  */
 boolean_t
 vdev_is_dead(vdev_t *vd)
 {
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_TRUE);
 
 	if (vd->vdev_ops == &vdev_hole_ops)
 		return (B_TRUE);
 
 	return (vd->vdev_ops == &vdev_missing_ops);
 }
 
 /*
  * A vdev is readable when it is not dead and its vdev_cant_read flag is
  * clear.
  */
 boolean_t
 vdev_readable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd))
 		return (B_FALSE);
 
 	return (!vd->vdev_cant_read);
 }
 
 /*
  * A vdev is writeable when it is not dead, its vdev_cant_write flag is
  * clear, and it is a concrete vdev.
  */
 boolean_t
 vdev_writeable(vdev_t *vd)
 {
 	if (vdev_is_dead(vd) || vd->vdev_cant_write)
 		return (B_FALSE);
 
 	return (vdev_is_concrete(vd) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether new allocations may be placed on 'vd'.
  *
  * We currently allow allocations from vdevs which may be in the process
  * of reopening (i.e. VDEV_STATE_CLOSED).  If the device fails to reopen
  * then we'll catch it later when we're holding the proper locks.
  */
 boolean_t
 vdev_allocatable(vdev_t *vd)
 {
 	/*
 	 * Take a single snapshot of the state: it changes atomically, but
 	 * two separate questions are asked about it below.
 	 */
 	uint64_t state = vd->vdev_state;
 
 	if (state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED)
 		return (B_FALSE);
 
 	if (vd->vdev_cant_write || !vdev_is_concrete(vd))
 		return (B_FALSE);
 
 	return (vd->vdev_mg->mg_initialized ? B_TRUE : B_FALSE);
 }
 
 /*
  * Report whether 'zio', which must target 'vd', can be issued to it.
  * A dead vdev, or one with vdev_remove_wanted set, accepts nothing.
  * Otherwise reads depend on vdev_cant_read, writes on vdev_cant_write, and
  * every other I/O type is allowed.
  */
 boolean_t
 vdev_accessible(vdev_t *vd, zio_t *zio)
 {
 	ASSERT(zio->io_vd == vd);
 
 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
 		return (B_FALSE);
 
 	switch (zio->io_type) {
 	case ZIO_TYPE_READ:
 		return (!vd->vdev_cant_read);
 	case ZIO_TYPE_WRITE:
 		return (!vd->vdev_cant_write);
 	default:
 		return (B_TRUE);
 	}
 }
 
 /*
  * Report whether every offset on 'vd' can be described by a space map
  * entry.  This is always the case once SPA_FEATURE_SPACEMAP_V2 is active.
  */
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
 	uint64_t addr_bits;
 
 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
 		return (B_TRUE);
 
 	/*
 	 * If double-word space map entries are not enabled we assume
 	 * 47 bits of the space map entry are dedicated to the entry's
 	 * offset (see SM_OFFSET_BITS in space_map.h).  Offsets are in units
 	 * of 1 << ashift, so the largest describable address has
 	 * ashift + SM_OFFSET_BITS bits.
 	 */
 	addr_bits = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	/* Avoid overflowing the shift below. */
 	if (addr_bits >= 63)
 		return (B_TRUE);
 
 	return (vd->vdev_asize < (1ULL << addr_bits));
 }
 
 /*
  * Get statistics for the given vdev.
  *
  * Copies vd's vdev_stat into 'vs' under vd's vdev_stat_lock and then fills
  * in the derived fields: elapsed time, state, replaceable size, initialize
  * progress (leaves only), expandable space, fragmentation, and, for the
  * root vdev, I/O counts summed over its children.  The caller must hold
  * at least one SCL_ALL config lock as reader.
  */
 void
 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *tvd = vd->vdev_top;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
 	/* Report the time elapsed since the stored timestamp. */
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
 	if (vd->vdev_ops->vdev_op_leaf) {
 		/* For leaves the label areas are added to the size. */
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
 		/*
 		 * Report initializing progress. Since we don't have the
 		 * initializing locks held, this is only an estimate (although a
 		 * fairly accurate one).
 		 */
 		vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
 		vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
 		vs->vs_initialize_state = vd->vdev_initialize_state;
 		vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
 	}
 	/*
 	 * Report expandable space on top-level, non-auxiliary devices only.
 	 * The expandable space is reported in terms of metaslab sized units
 	 * since that determines how much space the pool can expand.
 	 */
 	if (vd->vdev_aux == NULL && tvd != NULL) {
 		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize -
 		    spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
 	}
 	/* Fragmentation comes from the metaslab group, when there is one. */
 	if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
 	    vdev_is_concrete(vd)) {
 		vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 		    vd->vdev_mg->mg_fragmentation : 0;
 	}
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
 	 * over all top-level vdevs (i.e. the direct children of the root).
 	 *
 	 * NOTE(review): the children's stats are read, and their
 	 * vs_scan_removing is written, while only the root's
 	 * vdev_stat_lock is held -- confirm this is intentional.
 	 */
 	if (vd == rvd) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *cvd = rvd->vdev_child[c];
 			vdev_stat_t *cvs = &cvd->vdev_stat;
 
 			for (int t = 0; t < ZIO_TYPES; t++) {
 				vs->vs_ops[t] += cvs->vs_ops[t];
 				vs->vs_bytes[t] += cvs->vs_bytes[t];
 			}
 			cvs->vs_scan_removing = cvd->vdev_removing;
 		}
 	}
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Reset the space accounting fields (vs_space, vs_dspace and vs_alloc) of
  * 'vd' to zero under its vdev_stat_lock.
  */
 void
 vdev_clear_stats(vdev_t *vd)
 {
 	vdev_stat_t *vs = &vd->vdev_stat;
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vs->vs_space = 0;
 	vs->vs_dspace = 0;
 	vs->vs_alloc = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Zero the vs_scan_processed counter of 'vd' and of every vdev beneath it.
  * Children are handled first; each vdev's counter is reset under its own
  * vdev_stat_lock.
  */
 void
 vdev_scan_stat_init(vdev_t *vd)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_scan_stat_init(vd->vdev_child[i]);
 	}
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_scan_processed = 0;
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
  * Update the statistics of the vdev targeted by 'zio' (or of the root vdev
  * when the zio has no vdev) once the I/O has completed.  'psize' is the
  * number of bytes to account for.
  *
  * A successful I/O bumps the op and byte counters, plus the scan and
  * self-heal counters for repair I/O.  A failed I/O bumps the read,
  * write or checksum error counter and, for failed writes that carry a txg,
  * records the txg in the affected DTLs so the data can be resilvered.
  */
 void
 vdev_stat_update(zio_t *zio, uint64_t psize)
 {
 	spa_t *spa = zio->io_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
 	vdev_t *pvd;
 	uint64_t txg = zio->io_txg;
 	vdev_stat_t *vs = &vd->vdev_stat;
 	zio_type_t type = zio->io_type;
 	int flags = zio->io_flags;
 
 	/*
 	 * If this i/o is a gang leader, it didn't do any actual work.
 	 */
 	if (zio->io_gang_tree)
 		return;
 
 	if (zio->io_error == 0) {
 		/*
 		 * If this is a root i/o, don't count it -- we've already
 		 * counted the top-level vdevs, and vdev_get_stats() will
 		 * aggregate them when asked.  This reduces contention on
 		 * the root vdev_stat_lock and implicitly handles blocks
 		 * that compress away to holes, for which there is no i/o.
 		 * (Holes never create vdev children, so all the counters
 		 * remain zero, which is what we want.)
 		 *
 		 * Note: this only applies to successful i/o (io_error == 0)
 		 * because unlike i/o counts, errors are not additive.
 		 * When reading a ditto block, for example, failure of
 		 * one top-level vdev does not imply a root-level error.
 		 */
 		if (vd == rvd)
 			return;
 
 		ASSERT(vd == zio->io_vd);
 
 		/* I/O flagged as bypass is not counted. */
 		if (flags & ZIO_FLAG_IO_BYPASS)
 			return;
 
 		mutex_enter(&vd->vdev_stat_lock);
 
 		if (flags & ZIO_FLAG_IO_REPAIR) {
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				dsl_scan_phys_t *scn_phys =
 				    &spa->spa_dsl_pool->dp_scan->scn_phys;
 				uint64_t *processed = &scn_phys->scn_processed;
 
 				/* XXX cleanup? */
 				/*
 				 * The pool-wide counter is advanced for
 				 * leaves only; the per-vdev counter is
 				 * advanced at every level.
 				 */
 				if (vd->vdev_ops->vdev_op_leaf)
 					atomic_add_64(processed, psize);
 				vs->vs_scan_processed += psize;
 			}
 
 			if (flags & ZIO_FLAG_SELF_HEAL)
 				vs->vs_self_healed += psize;
 		}
 
 		vs->vs_ops[type]++;
 		vs->vs_bytes[type] += psize;
 
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
 
 	/* Failures of speculative I/O are not recorded. */
 	if (flags & ZIO_FLAG_SPECULATIVE)
 		return;
 
 	/*
 	 * If this is an I/O error that is going to be retried, then ignore the
 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
 	 * hard errors, when in reality they can happen for any number of
 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
 	 */
 	if (zio->io_error == EIO &&
 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
 		return;
 
 	/*
 	 * Intent logs writes won't propagate their error to the root
 	 * I/O so don't mark these types of failures as pool-level
 	 * errors.
 	 */
 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 		return;
 
 	/* Errors are only counted against vdevs that are not dead. */
 	mutex_enter(&vd->vdev_stat_lock);
 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
 		if (zio->io_error == ECKSUM)
 			vs->vs_checksum_errors++;
 		else
 			vs->vs_read_errors++;
 	}
 	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (spa->spa_load_state == SPA_LOAD_NONE &&
 	    type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
 	    spa->spa_claiming)) {
 		/*
 		 * This is either a normal write (not a repair), or it's
 		 * a repair induced by the scrub thread, or it's a repair
 		 * made by zil_claim() during spa_load() in the first txg.
 		 * In the normal case, we commit the DTL change in the same
 		 * txg as the block was born.  In the scrub-induced repair
 		 * case, we know that scrubs run in first-pass syncing context,
 		 * so we commit the DTL change in spa_syncing_txg(spa).
 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
 		 */
 		if (vd->vdev_ops->vdev_op_leaf) {
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCAN_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
 				commit_txg = spa_syncing_txg(spa);
 			} else if (spa->spa_claiming) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				commit_txg = spa_first_txg(spa);
 			}
 			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			/* Already recorded as missing: nothing more to do. */
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			/* Mark the txg partial on the leaf and its ancestors. */
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
 		}
 		if (vd != rvd)
 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
 	}
 }
 
 /*
  * Convert a raw space amount on this vdev into deflated space: 'space' is
  * shifted down by SPA_MINBLOCKSHIFT and the result is scaled by the vdev's
  * deflate ratio.  'space' must be a multiple of SPA_MINBLOCKSIZE.
  */
 int64_t
 vdev_deflated_space(vdev_t *vd, int64_t space)
 {
 	int64_t nunits;
 
 	ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 
 	nunits = space >> SPA_MINBLOCKSHIFT;
 	return (nunits * vd->vdev_deflate_ratio);
 }
 
 /*
  * Update the in-core space usage stats for this vdev and the root vdev.
  *
  * 'vd' must be a top-level vdev.  alloc_delta and space_delta are applied
  * as given; the dspace delta is derived from space_delta.  defer_delta is
  * accepted but not used by this function.
  *
  * Note: metaslab_class_space_update moved to metaslab_space_update.
  */
 void
 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
     int64_t space_delta)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int64_t deflated_delta;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
 	 * factor.  We must calculate this here and not at the root vdev
 	 * because the root vdev's psize-to-asize is simply the max of its
 	 * childrens', thus not accurate enough for us.
 	 */
 	deflated_delta = vdev_deflated_space(vd, space_delta);
 
 	mutex_enter(&vd->vdev_stat_lock);
 	vd->vdev_stat.vs_dspace += deflated_delta;
 	vd->vdev_stat.vs_space += space_delta;
 	vd->vdev_stat.vs_alloc += alloc_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	/*
 	 * Every class but log contributes to root space stats; a vdev whose
 	 * vdev_mg is NULL contributes nothing either.
 	 */
 	if (vd->vdev_mg == NULL || vd->vdev_islog)
 		return;
 
 	mutex_enter(&rvd->vdev_stat_lock);
 	rvd->vdev_stat.vs_dspace += deflated_delta;
 	rvd->vdev_stat.vs_space += space_delta;
 	rvd->vdev_stat.vs_alloc += alloc_delta;
 	mutex_exit(&rvd->vdev_stat_lock);
 }
 
 /*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
  * so that it will be written out next time the vdev configuration is synced.
  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
  *
  * Aux vdevs (vdev_aux != NULL) never go on the dirty list; their entry in
  * the aux config nvlist array is regenerated in place instead.
  */
 void
 vdev_config_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_writeable(spa));
 
 	/*
 	 * If this is an aux vdev (as with l2cache and spare devices), then we
 	 * update the vdev config manually and set the sync flag.
 	 */
 	if (vd->vdev_aux != NULL) {
 		spa_aux_vdev_t *sav = vd->vdev_aux;
 		nvlist_t **aux;
 		uint_t naux;
 
 		/* Locate this vdev's slot in the aux vdev array. */
 		c = 0;
 		while (c < sav->sav_count && sav->sav_vdevs[c] != vd)
 			c++;
 
 		if (c == sav->sav_count) {
 			/*
 			 * We're being removed.  There's nothing more to do.
 			 */
 			ASSERT(sav->sav_sync == B_TRUE);
 			return;
 		}
 
 		sav->sav_sync = B_TRUE;
 
 		/* The array lives under the l2cache key, else under spares. */
 		if (nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
 		}
 
 		ASSERT(c < naux);
 
 		/*
 		 * Setting the nvlist in the middle of the array is a little
 		 * sketchy, but it will work.
 		 */
 		nvlist_free(aux[c]);
 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
 
 		return;
 	}
 
 	/*
 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	/* The root vdev stands for every one of its children. */
 	if (vd == rvd) {
 		for (c = 0; c < rvd->vdev_children; c++)
 			vdev_config_dirty(rvd->vdev_child[c]);
 		return;
 	}
 
 	ASSERT(vd == vd->vdev_top);
 
 	/* Already queued, or not a concrete vdev: nothing to add. */
 	if (list_link_active(&vd->vdev_config_dirty_node) ||
 	    !vdev_is_concrete(vd))
 		return;
 
 	list_insert_head(&spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Remove a vdev from the spa's config dirty list.  The vdev must currently
  * be linked on that list.
  */
 void
 vdev_config_clean(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/* SCL_CONFIG held as writer, or sync context holding it as reader. */
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
 
 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
 	list_remove(&spa->spa_config_dirty_list, vd);
 }
 
 /*
  * Mark a top-level vdev's state as dirty, so that the next pass of
  * spa_sync() can convert this into vdev_config_dirty().  We distinguish
  * the state changes from larger config changes because they require
  * much less locking, and are often needed for administrative actions.
  *
  * The vdev is queued at most once, and only if it is concrete.
  */
 void
 vdev_state_dirty(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_writeable(spa));
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * The state list is protected by the SCL_STATE lock.  The caller
 	 * must either hold SCL_STATE as writer, or must be the sync thread
 	 * (which holds SCL_STATE as reader).  There's only one sync thread,
 	 * so this is sufficient to ensure mutual exclusion.
 	 */
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
 	/* Already on the list: leave it where it is. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		return;
 
 	/* Non-concrete vdevs are never queued. */
 	if (!vdev_is_concrete(vd))
 		return;
 
 	list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Remove a vdev from the spa's state dirty list.  The vdev must currently
  * be linked on that list.
  */
 void
 vdev_state_clean(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	/* SCL_STATE held as writer, or sync context holding it as reader. */
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
 	list_remove(&spa->spa_state_dirty_list, vd);
 }
 
 /*
  * Propagate vdev state up from children to parent.
  *
  * Starting at 'vd' and continuing through each ancestor, every vdev that
  * has children counts its faulted and degraded children and hands the
  * totals to its vdev_op_state_change method.
  */
 void
 vdev_propagate_state(vdev_t *vd)
 {
 	/*
 	 * Moving on to the parent is the final step for each vdev, so the
 	 * ancestor chain is walked with a loop.
 	 */
 	do {
 		spa_t *spa = vd->vdev_spa;
 		vdev_t *rvd = spa->spa_root_vdev;
 		int nfaulted = 0;
 		int ndegraded = 0;
 		int ncorrupt = 0;
 
 		if (vd->vdev_children > 0) {
 			for (int c = 0; c < vd->vdev_children; c++) {
 				vdev_t *kid = vd->vdev_child[c];
 
 				/*
 				 * Don't factor holes or indirect vdevs into
 				 * the decision.
 				 */
 				if (!vdev_is_concrete(kid))
 					continue;
 
 				if (!vdev_readable(kid) ||
 				    (!vdev_writeable(kid) &&
 				    spa_writeable(spa))) {
 					/*
 					 * Root special: if there is a
 					 * top-level log device, treat the
 					 * root vdev as if it were degraded.
 					 */
 					if (kid->vdev_islog && vd == rvd)
 						ndegraded++;
 					else
 						nfaulted++;
 				} else if (kid->vdev_state <=
 				    VDEV_STATE_DEGRADED) {
 					ndegraded++;
 				}
 
 				if (kid->vdev_stat.vs_aux ==
 				    VDEV_AUX_CORRUPT_DATA)
 					ncorrupt++;
 			}
 
 			vd->vdev_ops->vdev_op_state_change(vd, nfaulted,
 			    ndegraded);
 
 			/*
 			 * Root special: if there is a top-level vdev that
 			 * cannot be opened due to corrupted metadata, then
 			 * propagate the root vdev's aux state as 'corrupt'
 			 * rather than 'insufficient replicas'.
 			 */
 			if (ncorrupt && vd == rvd &&
 			    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
 				vdev_set_state(rvd, B_FALSE,
 				    VDEV_STATE_CANT_OPEN,
 				    VDEV_AUX_CORRUPT_DATA);
 		}
 
 		vd = vd->vdev_parent;
 	} while (vd);
 }
 
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
  * Otherwise, we propagate the change to the parent.
  *
  * If this routine places a device in a faulted state, an appropriate ereport is
  * generated.
  *
  *	vd	vdev whose state is being set
  *	isopen	when true, the parent's state is not re-propagated
  *	state	new value for vd->vdev_state
  *	aux	new value for vd->vdev_stat.vs_aux
  *
  * If 'state' equals the current state, only vs_aux is updated and nothing
  * else in this routine runs.
  */
 void
 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
 {
 	uint64_t save_state;
 	spa_t *spa = vd->vdev_spa;
 
 	if (state == vd->vdev_state) {
 		vd->vdev_stat.vs_aux = aux;
 		return;
 	}
 
 	/* Remember the old state; it is handed to zfs_ereport_post() below. */
 	save_state = vd->vdev_state;
 
 	vd->vdev_state = state;
 	vd->vdev_stat.vs_aux = aux;
 
 	/*
 	 * If we are setting the vdev state to anything but an open state, then
 	 * always close the underlying device unless the device has requested
 	 * a delayed close (i.e. we're about to remove or fault the device).
 	 * Otherwise, we keep accessible but invalid devices open forever.
 	 * We don't call vdev_close() itself, because that implies some extra
 	 * checks (offline, etc) that we don't want here.  This is limited to
 	 * leaf devices, because otherwise closing the device will affect other
 	 * children.
 	 */
 	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
 	    vd->vdev_ops->vdev_op_leaf)
 		vd->vdev_ops->vdev_op_close(vd);
 
 	/*
 	 * If we have brought this vdev back into service, we need
 	 * to notify fmd so that it can gracefully repair any outstanding
 	 * cases due to a missing device.  We do this in all cases, even those
 	 * that probably don't correlate to a repaired fault.  This is sure to
 	 * catch all cases, and we let the zfs-retire agent sort it out.  If
 	 * this is a transient state it's OK, as the retire agent will
 	 * double-check the state of the vdev before repairing it.
 	 */
 	if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf &&
 	    vd->vdev_prevstate != state)
 		zfs_post_state_change(spa, vd);
 
 	if (vd->vdev_removed &&
 	    state == VDEV_STATE_CANT_OPEN &&
 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
 		/*
 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
 		 * device was previously marked removed and someone attempted to
 		 * reopen it.  If this failed due to a nonexistent device, then
 		 * keep the device in the REMOVED state.  We also let this be if
 		 * it is one of our special test online cases, which is only
 		 * attempting to online the device and shouldn't generate an FMA
 		 * fault.
 		 */
 		vd->vdev_state = VDEV_STATE_REMOVED;
 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
 	} else if (state == VDEV_STATE_REMOVED) {
 		vd->vdev_removed = B_TRUE;
 	} else if (state == VDEV_STATE_CANT_OPEN) {
 		/*
 		 * If we fail to open a vdev during an import or recovery, we
 		 * mark it as "not available", which signifies that it was
 		 * never there to begin with.  Failure to open such a device
 		 * is not considered an error.
 		 */
 		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
 		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
 		    vd->vdev_ops->vdev_op_leaf)
 			vd->vdev_not_present = 1;
 
 		/*
 		 * Post the appropriate ereport.  If the 'prevstate' field is
 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
 		 * that this is part of a vdev_reopen().  In this case, we don't
 		 * want to post the ereport if the device was already in the
 		 * CANT_OPEN state beforehand.
 		 *
 		 * If the 'checkremove' flag is set, then this is an attempt to
 		 * online the device in response to an insertion event.  If we
 		 * hit this case, then we have detected an insertion event for a
 		 * faulted or offline device that wasn't in the removed state.
 		 * In this scenario, we don't post an ereport because we are
 		 * about to replace the device, or attempt an online with
 		 * vdev_forcefault, which will generate the fault for us.
 		 */
 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
 		    vd != spa->spa_root_vdev) {
 			const char *class;
 
 			/* Pick the ereport class matching the aux reason. */
 			switch (aux) {
 			case VDEV_AUX_OPEN_FAILED:
 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
 				break;
 			case VDEV_AUX_CORRUPT_DATA:
 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
 				break;
 			case VDEV_AUX_NO_REPLICAS:
 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
 				break;
 			case VDEV_AUX_BAD_GUID_SUM:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
 				break;
 			case VDEV_AUX_TOO_SMALL:
 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
 				break;
 			case VDEV_AUX_BAD_LABEL:
 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
 				break;
 			default:
 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
 			}
 
 			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
 		}
 
 		/* Erase any notion of persistent removed state */
 		vd->vdev_removed = B_FALSE;
 	} else {
 		vd->vdev_removed = B_FALSE;
 	}
 
 	/* Outside of an open, let the ancestors re-derive their state. */
 	if (!isopen && vd->vdev_parent)
 		vdev_propagate_state(vd->vdev_parent);
 }
 
 /*
  * Return B_TRUE when every child of this non-leaf vdev is in
  * VDEV_STATE_OFFLINE (trivially true when there are no children),
  * B_FALSE as soon as one child is in any other state.
  */
 boolean_t
 vdev_children_are_offline(vdev_t *vd)
 {
 	uint64_t idx = 0;
 
 	ASSERT(!vd->vdev_ops->vdev_op_leaf);
 
 	while (idx < vd->vdev_children) {
 		vdev_t *kid = vd->vdev_child[idx];
 
 		if (kid->vdev_state != VDEV_STATE_OFFLINE)
 			return (B_FALSE);
 		idx++;
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Check the vdev configuration to ensure that it's capable of supporting
  * a root pool. We do not support partial configuration.
  * In addition, only a single top-level vdev is allowed.
  *
  * The check is applied to 'vd' and then recursively to each of its children.
  */
 boolean_t
 vdev_is_bootable(vdev_t *vd)
 {
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		char *type = vd->vdev_ops->vdev_op_type;
 		int multi_top, placeholder;
 
 		/* A root vdev with more than one top-level child. */
 		multi_top = (strcmp(type, VDEV_TYPE_ROOT) == 0 &&
 		    vd->vdev_children > 1);
 
 		/* A missing or indirect vdev anywhere in the tree. */
 		placeholder = (strcmp(type, VDEV_TYPE_MISSING) == 0 ||
 		    strcmp(type, VDEV_TYPE_INDIRECT) == 0);
 
 		if (multi_top || placeholder)
 			return (B_FALSE);
 	}
 
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *kid = vd->vdev_child[i];
 
 		if (!vdev_is_bootable(kid))
 			return (B_FALSE);
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * A vdev is "concrete" unless its ops vector is one of the indirect, hole,
  * missing or root vdev ops.
  */
 boolean_t
 vdev_is_concrete(vdev_t *vd)
 {
 	vdev_ops_t *ops = vd->vdev_ops;
 
 	if (ops == &vdev_indirect_ops)
 		return (B_FALSE);
 	if (ops == &vdev_hole_ops)
 		return (B_FALSE);
 	if (ops == &vdev_missing_ops)
 		return (B_FALSE);
 	if (ops == &vdev_root_ops)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Determine if a log device has valid content.  If the vdev was
  * removed or faulted in the MOS config then we know that
  * the content on the log device has already been written to the pool.
  *
  * Returns B_TRUE if 'vd' itself, or any vdev beneath it, is a leaf that is
  * neither faulted nor removed.
  */
 boolean_t
 vdev_log_state_valid(vdev_t *vd)
 {
 	int i = 0;
 
 	if (vd->vdev_ops->vdev_op_leaf &&
 	    !(vd->vdev_faulted || vd->vdev_removed))
 		return (B_TRUE);
 
 	while (i < vd->vdev_children) {
 		if (vdev_log_state_valid(vd->vdev_child[i]))
 			return (B_TRUE);
 		i++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Expand a vdev if possible.
  *
  * The deflate ratio is always recomputed.  New metaslabs are initialized,
  * and the config dirtied, only when the vdev's asize now covers more
  * metaslabs than vdev_ms_count.
  */
 void
 vdev_expand(vdev_t *vd, uint64_t txg)
 {
 	ASSERT(vd->vdev_top == vd);
 	ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vdev_is_concrete(vd));
 
 	vdev_set_deflate_ratio(vd);
 
 	/* No room for additional metaslabs: nothing to grow. */
 	if ((vd->vdev_asize >> vd->vdev_ms_shift) <= vd->vdev_ms_count)
 		return;
 
 	/* Only concrete vdevs get metaslabs. */
 	if (!vdev_is_concrete(vd))
 		return;
 
 	vdev_metaslab_group_create(vd);
 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
 	vdev_config_dirty(vd);
 }
 
 /*
  * Split a vdev.
  *
  * 'vd' is detached from its parent and the parent's child array is
  * compacted.  If exactly one child is left, the parent is removed from
  * above that child and the child is flagged as splitting.  State is then
  * propagated upward from the parent's first child.
  */
 void
 vdev_split(vdev_t *vd)
 {
 	vdev_t *parent = vd->vdev_parent;
 	vdev_t *first;
 
 	vdev_remove_child(parent, vd);
 	vdev_compact_children(parent);
 
 	first = parent->vdev_child[0];
 	if (parent->vdev_children == 1) {
 		vdev_remove_parent(first);
 		first->vdev_splitting = B_TRUE;
 	}
 
 	vdev_propagate_state(first);
 }
 
 /*
  * Deadman check.  Recurses through every child of 'vd'; on each leaf vdev
  * whose active tree is non-empty, the first zio in that tree is examined
  * and the system is panicked if it has been outstanding for longer than
  * spa_deadman_synctime().
  */
 void
 vdev_deadman(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++) {
 		vdev_t *cvd = vd->vdev_child[c];
 
 		vdev_deadman(cvd);
 	}
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		vdev_queue_t *vq = &vd->vdev_queue;
 
 		mutex_enter(&vq->vq_lock);
 		if (avl_numnodes(&vq->vq_active_tree) > 0) {
 			spa_t *spa = vd->vdev_spa;
 			zio_t *fio;
 			uint64_t delta;
 
 			/*
 			 * Look at the head of all the pending queues,
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime we panic the system.
 			 */
 			fio = avl_first(&vq->vq_active_tree);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa)) {
 				/*
 				 * Every %llu argument is cast explicitly so
 				 * the varargs match the format regardless of
 				 * how the timestamp types are defined.
 				 */
 				vdev_dbgmsg(vd, "SLOW IO: zio timestamp "
 				    "%lluns, delta %lluns, last io %lluns",
 				    (u_longlong_t)fio->io_timestamp,
 				    (u_longlong_t)delta,
 				    (u_longlong_t)vq->vq_io_complete_ts);
 				fm_panic("I/O to pool '%s' appears to be "
 				    "hung.", spa_name(spa));
 			}
 		}
 		mutex_exit(&vq->vq_lock);
 	}
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	(revision 354953)
@@ -1,294 +1,294 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
 
 /*
  * Virtual device vector for files.
  */
 
 /*
  * Hold callback for file vdevs (installed as vdev_op_hold).  Nothing is
  * acquired here; it only asserts that the vdev has a path.
  */
 static void
 vdev_file_hold(vdev_t *vd)
 {
 	ASSERT(vd->vdev_path != NULL);
 }
 
 /*
  * Release callback for file vdevs (installed as vdev_op_rele).  Nothing is
  * released here; it only asserts that the vdev has a path.
  */
 static void
 vdev_file_rele(vdev_t *vd)
 {
 	ASSERT(vd->vdev_path != NULL);
 }
 
 /*
  * Open callback for file vdevs.
  *
  * Opens vd->vdev_path (which must be absolute) relative to rootdir unless
  * the vdev is already open, then reports the file's size through *psize and
  * *max_psize and SPA_MINBLOCKSHIFT through *ashift.  On failure an errno is
  * returned and vd->vdev_stat.vs_aux records the reason; any state already
  * attached to vd->vdev_tsd is left in place.
  */
 static int
 vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
     uint64_t *ashift)
 {
 	vdev_file_t *vf;
 	vnode_t *vp;
 	vattr_t vattr;
 	int error;
 
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Reopen the device if it's not currently open.  Otherwise,
 	 * just update the physical size of the device.
 	 */
 	if (vd->vdev_tsd != NULL) {
 		ASSERT(vd->vdev_reopening);
 		vf = vd->vdev_tsd;
 	} else {
 		vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t),
 		    KM_SLEEP);
 
 		/*
 		 * We always open the files from the root of the global zone,
 		 * even if we're in a local zone.  If the user has gotten to
 		 * this point, the administrator has already decided that the
 		 * pool should be available to local zone users, so the
 		 * underlying devices should be as well.
 		 */
 		ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
 		error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
 		    spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir,
 		    -1);
 
 		if (error) {
 			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 			return (error);
 		}
 
 		vf->vf_vnode = vp;
 
 #ifdef _KERNEL
 		/*
 		 * Make sure it's a regular file.
 		 */
 		if (vp->v_type != VREG) {
 			vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 			return (SET_ERROR(ENODEV));
 		}
 #endif
 	}
 
 	/*
 	 * Determine the physical size of the file.
 	 */
 	vattr.va_mask = AT_SIZE;
 	error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
 	if (error) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
 		return (error);
 	}
 
 	*psize = vattr.va_size;
 	*max_psize = *psize;
 	*ashift = SPA_MINBLOCKSHIFT;
 
 	return (0);
 }
 
 /*
  * Close callback for file vdevs.  Does nothing during a reopen or when the
  * vdev has no per-vdev state; otherwise flushes and closes the vnode (if
  * one was opened), frees the vdev_file_t and clears vdev_delayed_close.
  */
 static void
 vdev_file_close(vdev_t *vd)
 {
 	vdev_file_t *vf = vd->vdev_tsd;
 
 	if (vd->vdev_reopening)
 		return;
 	if (vf == NULL)
 		return;
 
 	if (vf->vf_vnode != NULL) {
 		(void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
 		(void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
 		    kcred, NULL);
 		VN_RELE(vf->vf_vnode);
 	}
 
 	vd->vdev_tsd = NULL;
 	vd->vdev_delayed_close = B_FALSE;
 	kmem_free(vf, sizeof (vdev_file_t));
 }
 
 /*
  * Implements the interrupt side for file vdev types. This routine will be
  * called when the I/O completes allowing us to transfer the I/O to the
  * interrupt taskqs. For consistency, the code structure mimics disk vdev
  * types.
  *
  * Any buf error is reported on the zio as EIO; a transfer that left a
  * residual without an error is reported as ENOSPC.  The borrowed buffer is
  * handed back to the zio's abd and the vdev_buf_t is freed.
  */
 static int
 vdev_file_io_intr(buf_t *bp)
 {
 	vdev_buf_t *vb = (vdev_buf_t *)bp;
 	zio_t *zio = vb->vb_io;
 
 	if (geterror(bp) != 0)
 		zio->io_error = EIO;
 	else if (bp->b_resid != 0)
 		zio->io_error = SET_ERROR(ENOSPC);
 	else
 		zio->io_error = 0;
 
 	/* Reads copy the data back into the abd; writes just return it. */
 	if (zio->io_type != ZIO_TYPE_READ)
 		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 	else
 		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
 
 	kmem_free(vb, sizeof (vdev_buf_t));
 	zio_delay_interrupt(zio);
 	return (0);
 }
 
 /*
  * Taskq callback that performs the actual file I/O for a buf: reads or
  * writes b_bcount bytes at block b_lblkno of the vnode stored in b_private,
  * records either the residual or the error on the buf, and completes it.
  */
 static void
 vdev_file_io_strategy(void *arg)
 {
 	buf_t *bp = arg;
 	vnode_t *vp = bp->b_private;
 	ssize_t resid;
 	int error;
 
 	error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE,
 	    vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno),
 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 
 	if (error != 0)
 		bioerror(bp, error);
 	else
 		bp->b_resid = resid;
 
 	biodone(bp);
 }
 
 /*
  * Start an I/O on a file vdev.
  *
  * IOCTL zios are handled inline: DKIOCFLUSHWRITECACHE is turned into a
  * VOP_FSYNC on the backing vnode and any other command fails with ENOTSUP.
  * READ and WRITE zios are wrapped in a vdev_buf_t, given a linear buffer
  * borrowed from the zio's abd, and dispatched to system_taskq where
  * vdev_file_io_strategy() does the transfer; vdev_file_io_intr() is set as
  * the buf's completion callback.
  */
 static void
 vdev_file_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	vdev_file_t *vf = vd->vdev_tsd;
 	vdev_buf_t *vb;
 	buf_t *bp;
 
 	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
 		}
 
 		switch (zio->io_cmd) {
 		case DKIOCFLUSHWRITECACHE:
 			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
 			    kcred, NULL);
 			break;
 		default:
 			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
 		zio_execute(zio);
 		return;
 	}
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 	zio->io_target_timestamp = zio_handle_io_delay(zio);
 
 	/* Freed by vdev_file_io_intr() once the I/O has completed. */
 	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
 
 	vb->vb_io = zio;
 	bp = &vb->vb_buf;
 
 	bioinit(bp);
 	bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
 	bp->b_bcount = zio->io_size;
 
 	/* Writes need the abd's contents copied into the borrowed buffer. */
 	if (zio->io_type == ZIO_TYPE_READ) {
 		bp->b_un.b_addr =
 		    abd_borrow_buf(zio->io_abd, zio->io_size);
 	} else {
 		bp->b_un.b_addr =
 		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
 	}
 
 	bp->b_lblkno = lbtodb(zio->io_offset);
 	bp->b_bufsize = zio->io_size;
 	bp->b_private = vf->vf_vnode;
 	bp->b_iodone = vdev_file_io_intr;
 
 	VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp,
-	    TQ_SLEEP), !=, 0);
+	    TQ_SLEEP), !=, TASKQID_INVALID);
 }
 
 /*
  * I/O done callback for file vdevs (installed as vdev_op_io_done).  The
  * body is empty: no per-zio completion work is done here.
  */
 /* ARGSUSED */
 static void
 vdev_file_io_done(zio_t *zio)
 {
 }
 
 /*
  * Operations vector for file-backed vdevs.  The state_change, need_resilver
  * and remap entries are NULL; asize and xlate use the vdev_default_*
  * routines.
  */
 vdev_ops_t vdev_file_ops = {
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_type = VDEV_TYPE_FILE,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 /*
  * From userland we access disks just like files.
  */
 #ifndef _KERNEL
 
 /*
  * Operations vector for disk vdevs when built without _KERNEL: the same
  * callbacks as vdev_file_ops, registered under VDEV_TYPE_DISK.
  */
 vdev_ops_t vdev_disk_ops = {
 	.vdev_op_open = vdev_file_open,
 	.vdev_op_close = vdev_file_close,
 	.vdev_op_asize = vdev_default_asize,
 	.vdev_op_io_start = vdev_file_io_start,
 	.vdev_op_io_done = vdev_file_io_done,
 	.vdev_op_state_change = NULL,
 	.vdev_op_need_resilver = NULL,
 	.vdev_op_hold = vdev_file_hold,
 	.vdev_op_rele = vdev_file_rele,
 	.vdev_op_remap = NULL,
 	.vdev_op_xlate = vdev_default_xlate,
 	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
 	.vdev_op_leaf = B_TRUE			/* leaf vdev */
 };
 
 #endif
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	(revision 354953)
@@ -1,3417 +1,3418 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/resource.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/abd.h>
 
 /*
  * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
  * calls that change the file system. Each itx has enough information to
  * be able to replay them after a system crash, power loss, or
  * equivalent failure mode. These are stored in memory until either:
  *
  *   1. they are committed to the pool by the DMU transaction group
  *      (txg), at which point they can be discarded; or
  *   2. they are committed to the on-disk ZIL for the dataset being
  *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
  *      requirement).
  *
  * In the event of a crash or power loss, the itxs contained by each
  * dataset's on-disk ZIL will be replayed when that dataset is first
  * instantiated (e.g. if the dataset is a normal filesystem, when it is
  * first mounted).
  *
  * As hinted at above, there is one ZIL per dataset (both the in-memory
  * representation, and the on-disk representation). The on-disk format
  * consists of 3 parts:
  *
  *	- a single, per-dataset, ZIL header; which points to a chain of
  *	- zero or more ZIL blocks; each of which contains
  *	- zero or more ZIL records
  *
  * A ZIL record holds the information necessary to replay a single
  * system call transaction. A ZIL block can hold many ZIL records, and
  * the blocks are chained together, similarly to a singly linked list.
  *
  * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
  * block in the chain, and the ZIL header points to the first block in
  * the chain.
  *
  * Note, there is not a fixed place in the pool to hold these ZIL
  * blocks; they are dynamically allocated and freed as needed from the
  * blocks available on the pool, though they can be preferentially
  * allocated from a dedicated "log" vdev.
  */
 
 /*
  * This controls the amount of time that a ZIL block (lwb) will remain
  * "open" when it isn't "full", and it has a thread waiting for it to be
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
 int zfs_commit_timeout_pct = 5;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
  * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
  * the disk(s) by the ZIL after an LWB write has completed. Setting this
  * will cause ZIL corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 boolean_t zil_nocacheflush = B_FALSE;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 uint64_t zil_slog_bulk = 768 * 1024;
 
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
 
 /*
  * True when the lwb holds no log records, i.e. when the unused space
  * (lwb_sz - lwb_nused) equals the logical block size minus the
  * zil_chain_t.  The argument is parenthesized at each use so that any
  * pointer-valued expression may be passed.
  */
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&(lwb)->lwb_blk) - \
     sizeof (zil_chain_t)) == ((lwb)->lwb_sz - (lwb)->lwb_nused))
 
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
 
 	int cmp = AVL_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
 	if (likely(cmp))
 		return (cmp);
 
 	return (AVL_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
 }
 
 /*
  * Create the zilog's zl_bp_tree: an AVL tree of zil_bp_node_t entries
  * ordered by zil_bp_compare() and linked through zn_node.
  */
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 /*
  * Tear down the zilog's zl_bp_tree: free every node still in the tree,
  * then destroy the tree itself.
  */
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	void *cookie = NULL;
 	zil_bp_node_t *node;
 
 	for (node = avl_destroy_nodes(tree, &cookie); node != NULL;
 	    node = avl_destroy_nodes(tree, &cookie)) {
 		kmem_free(node, sizeof (zil_bp_node_t));
 	}
 
 	avl_destroy(tree);
 }
 
 /*
  * Record a block pointer's identity DVA in the zilog's zl_bp_tree.
  *
  * Returns 0 if the DVA was added, or if bp is an embedded block pointer
  * (which is not tracked), and EEXIST if the DVA is already in the tree.
  */
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	const dva_t *ident;
 	zil_bp_node_t *node;
 	avl_index_t where;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	ident = BP_IDENTITY(bp);
 
 	if (avl_find(tree, ident, &where) == NULL) {
 		node = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 		node->zn_dva = *ident;
 		avl_insert(tree, node, where);
 		return (0);
 	}
 
 	return (SET_ERROR(EEXIST));
 }
 
 /*
  * Return zilog->zl_header cast to a plain zil_header_t pointer.
  * NOTE(review): presumably zl_header is declared const and this is the
  * accessor for callers in syncing context that need to modify it -- confirm
  * against the zilog_t definition.
  */
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
 /*
  * Seed the checksum words of the first block pointer of a log chain: two
  * random GUID words, the objset id of the zilog's objset, and a sequence
  * number of 1.
  */
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	bp->blk_cksum.zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
 	bp->blk_cksum.zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
 	bp->blk_cksum.zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	bp->blk_cksum.zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
 /*
  * Read a log block and make sure it's valid.
  *
  * On success the block's log records are copied to 'dst', '*end' is set
  * just past the last copied byte, and '*nbp' receives the pointer to the
  * next block in the chain.  Returns 0, the error from arc_read(), or ECKSUM
  * when the block's chain linkage or used-size field does not validate.
  *
  * NOTE(review): 'dst' is assumed to be at least SPA_OLD_MAXBLOCKSIZE bytes,
  * as the ASSERTs below imply -- confirm against the caller.
  */
 static int
 zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst,
     char **end)
 {
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 		uint64_t size = BP_GET_LSIZE(bp);
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = abuf->b_data;
 			char *lr = (char *)(zilc + 1);
 
 			/*
 			 * zc_nused counts the chain header at the front of
 			 * the block, so it can be neither smaller than the
 			 * header nor larger than the block.  Rejecting other
 			 * values keeps the length below from underflowing
 			 * and the copy from running past the block.
 			 */
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				uint64_t len = zilc->zc_nused -
 				    sizeof (zil_chain_t);
 
 				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, len);
 				*end = (char *)dst + len;
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			/* Older format: the chain trailer ends the block. */
 			char *lr = abuf->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				ASSERT3U(zilc->zc_nused, <=,
 				    SPA_OLD_MAXBLOCKSIZE);
 				bcopy(lr, dst, zilc->zc_nused);
 				*end = (char *)dst + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
 /*
  * Read the data block referenced by a TX_WRITE log record.
  *
  * If 'wbuf' is non-NULL the block contents are copied into it; a hole
  * is represented by zero-filling the buffer.  With a NULL 'wbuf' the
  * block is only read, which lets callers test that it is readable.
  * Returns 0 on success or the error reported by arc_read().
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	const blkptr_t *bp = &lr->lr_blkptr;
 	enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	/* Nothing to read for a hole; just hand back zeroes if asked to. */
 	if (BP_IS_HOLE(bp)) {
 		if (wbuf != NULL)
 			bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		return (0);
 	}
 
 	/* An unclaimed log is read speculatively. */
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 	if (error != 0)
 		return (error);
 
 	if (wbuf != NULL)
 		bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
 	arc_buf_destroy(abuf, &abuf);
 
 	return (0);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
  *
  * zilog          - log to walk, starting at zh_log in its header
  * parse_blk_func - called for every block pointer in the chain
  * parse_lr_func  - called for every record in every valid block
  * arg, txg       - passed through unchanged to both callbacks
  *
  * Returns 0, or the first non-zero error produced by a callback, by
  * reading a log block, or by a malformed record (ECKSUM).  The error and
  * the highest block/record sequence numbers and counts seen are also
  * stored in the zilog's zl_parse_* fields.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk;
 	char *lrbuf, *lrp;
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		uint64_t reclen;
 		char *end;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 		if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end);
 		if (error != 0)
 			break;
 
 		for (lrp = lrbuf; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
 			uint64_t remaining = (uint64_t)(end - lrp);
 
 			/*
 			 * The record length comes from disk, so validate it
 			 * at run time: the common header must fit in what is
 			 * left of the block, and the record may neither be
 			 * shorter than its header (a zero length would loop
 			 * forever) nor extend past the end of the data.
 			 */
 			if (remaining < sizeof (lr_t)) {
 				error = SET_ERROR(ECKSUM);
 				goto done;
 			}
 			reclen = lr->lrc_reclen;
 			if (reclen < sizeof (lr_t) || reclen > remaining) {
 				error = SET_ERROR(ECKSUM);
 				goto done;
 			}
 			if (lr->lrc_seq > claim_lr_seq)
 				goto done;
 			if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0)
 				goto done;
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 	}
 done:
 	/* Publish the outcome of this walk for the caller to inspect. */
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
 	    (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
 
 	zil_bp_tree_fini(zilog);
 	zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
 
 	return (error);
 }
 
 /*
  * zil_parse() block callback used when the log is being cleared: free
  * each log block that has not been seen before during this parse.
  */
 /* ARGSUSED */
 static int
 zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * As we call this function from the context of a rewind to a
 	 * checkpoint, each ZIL block whose txg is later than the txg
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
 	if (bp->blk_birth >= first_txg)
 		return (-1);
 
 	/* Only free a block the first time it is added to the bp tree. */
 	if (zil_bp_tree_add(zilog, bp) == 0)
 		zio_free(zilog->zl_spa, first_txg, bp);
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback that ignores the record and reports
  * success, for walks that only care about log blocks.
  */
 /* ARGSUSED */
 static int
 zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	return (0);
 }
 
 /*
  * Claim log block if not already committed and not already claimed.
  * If tx == NULL, just verify that the block is claimable.
  *
  * Returns 0 when there is nothing to claim, otherwise the result of
  * waiting on the claim zio.
  */
 static int
 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
 {
 	uint64_t claim_txg;
 
 	/* Holes carry no storage to claim. */
 	if (BP_IS_HOLE(bp))
 		return (0);
 
 	/* Blocks born before first_txg are already committed. */
 	if (bp->blk_birth < first_txg)
 		return (0);
 
 	/* A block already present in the bp tree was handled earlier. */
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	claim_txg = (tx == NULL) ? 0 : first_txg;
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa, claim_txg, bp,
 	    spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 /*
  * zil_parse() record callback used while claiming: for TX_WRITE records,
  * claim the data block the record points to.  Every other record type is
  * ignored.
  */
 static int
 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (lr->lr_blkptr.blk_birth >= first_txg) {
 		int error = zil_read_log_data(zilog, lr, NULL);
 
 		if (error != 0)
 			return (error);
 	}
 
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 /*
  * zil_parse() block callback used while destroying a log: free the log
  * block in the txg of the supplied transaction.
  */
 /* ARGSUSED */
 static int
 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg)
 {
 	uint64_t free_txg = dmu_tx_get_txg(tx);
 
 	zio_free(zilog->zl_spa, free_txg, bp);
 
 	return (0);
 }
 
 /*
  * zil_parse() record callback used while destroying a log: free the data
  * block of a TX_WRITE record if that block was claimed earlier.
  */
 static int
 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
 	/* Nothing was ever claimed for this log. */
 	if (claim_txg == 0)
 		return (0);
 
 	/* Only write records reference a separately allocated block. */
 	if (lrc->lrc_txtype != TX_WRITE)
 		return (0);
 
 	/* Blocks born before the claim txg were not claimed by us. */
 	if (bp->blk_birth < claim_txg)
 		return (0);
 
 	/*
 	 * If we previously claimed it, we need to free it -- but only
 	 * once, and only if it is a real block.
 	 */
 	if (zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp))
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	return (AVL_CMP(v1, v2));
 }
 
 /*
  * Allocate a log write block (lwb) for the log block described by 'bp',
  * initialize it in the CLOSED state with a data buffer as large as the
  * block, and append it to the zilog's lwb list.
  *
  * slog - stored in lwb_slog
  * txg  - initial value of lwb_max_txg
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	uint64_t lsize = BP_GET_LSIZE(bp);
 	lwb_t *new_lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 
 	new_lwb->lwb_zilog = zilog;
 	new_lwb->lwb_blk = *bp;
 	new_lwb->lwb_slog = slog;
 	new_lwb->lwb_state = LWB_STATE_CLOSED;
 	new_lwb->lwb_buf = zio_buf_alloc(lsize);
 	new_lwb->lwb_max_txg = txg;
 	new_lwb->lwb_write_zio = NULL;
 	new_lwb->lwb_root_zio = NULL;
 	new_lwb->lwb_tx = NULL;
 	new_lwb->lwb_issued_timestamp = 0;
 
 	/*
 	 * ZILOG2 blocks account for a zil_chain_t as already-used space
 	 * out of the whole block; for the other format the zil_chain_t
 	 * is subtracted from the usable size instead.
 	 */
 	if (BP_GET_CHECKSUM(bp) != ZIO_CHECKSUM_ZILOG2) {
 		new_lwb->lwb_nused = 0;
 		new_lwb->lwb_sz = lsize - sizeof (zil_chain_t);
 	} else {
 		new_lwb->lwb_nused = sizeof (zil_chain_t);
 		new_lwb->lwb_sz = lsize;
 	}
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, new_lwb);
 	mutex_exit(&zilog->zl_lock);
 
 	/* A freshly allocated lwb must not carry any leftover state. */
 	ASSERT(!MUTEX_HELD(&new_lwb->lwb_vdev_lock));
 	ASSERT(avl_is_empty(&new_lwb->lwb_vdev_tree));
 	VERIFY(list_is_empty(&new_lwb->lwb_waiters));
 
 	return (new_lwb);
 }
 
 /*
  * Return an lwb to the lwb cache.  The caller must hold zl_lock, and the
  * lwb must be idle: no zios, no waiters, an empty vdev tree, and either
  * in the CLOSED or the FLUSH_DONE state.
  */
 static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	/* Locking expectations. */
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/* Nothing may still be using or waiting on this lwb. */
 	ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
 	 * valid, and prevent use-after-free errors.
 	 */
 	if (zilog->zl_last_lwb_opened == lwb)
 		zilog->zl_last_lwb_opened = NULL;
 
 	kmem_cache_free(zil_lwb_cache, lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  *
  * Adds the zilog to the pool's list of dirty zilogs for 'txg'.  Panics
  * if the zilog belongs to a snapshot.
  */
 void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	/* Nothing more to do unless txg_list_add() reports success. */
 	if (!txg_list_add(&dp->dp_dirty_zilogs, zilog, txg))
 		return;
 
 	/* up the hold count until we can be written out */
 	dmu_buf_add_ref(ds->ds_dbuf, zilog);
 
 	if (zilog->zl_dirty_max_txg < txg)
 		zilog->zl_dirty_max_txg = txg;
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
  *
  * Returns B_TRUE if the zilog is on the pool's dirty-zilog list for the
  * slot selected by (txg & TXG_MASK), B_FALSE otherwise.
  */
 boolean_t
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	return (txg_list_member(&dp->dp_dirty_zilogs, zilog,
 	    txg & TXG_MASK) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
  *
  * Implemented by probing every one of the TXG_SIZE slots of the pool's
  * dirty-zilog list.
  */
 boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	int slot = 0;
 
 	while (slot < TXG_SIZE) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, slot))
 			return (B_TRUE);
 		slot++;
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Create an on-disk intent log.
  *
  * Reuses the block recorded in the header's zh_log if there is one with
  * the right byte order; otherwise allocates (and, if needed, first frees)
  * a log block inside a new transaction and waits for that txg to sync.
  *
  * Returns the lwb for the first log block, or NULL if a new log block
  * could not be allocated.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t slog = B_FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		/* Release the wrong-endian block before replacing it. */
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa,
 		    zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL,
 		    ZIL_MIN_BLKSZ, &slog);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 
 	/*
 	 * The header can only be expected to match our block when the
 	 * allocation succeeded; on failure no lwb exists for 'blk', so
 	 * the comparison is skipped.
 	 */
 	ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header. If keep_first
  * is set, then we're replaying a log with no content. We want to keep the
  * first block, however, so that the first synchronous transaction doesn't
  * require a txg_wait_synced() in zil_create(). We don't need to
  * txg_wait_synced() here either when keep_first is set, because both
  * zil_create() and zil_destroy() will wait for any in-progress destroys
  * to complete.
  */
 void
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	/* No log block recorded in the header: nothing to destroy. */
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	/*
 	 * Remember the txg of this destroy; zil_create() and zil_destroy()
 	 * wait for zl_destroy_txg to sync before doing anything else.
 	 */
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		/*
 		 * There are in-memory lwbs: release each one's buffer (if
 		 * it still has one), free its on-disk block in this txg,
 		 * and free the lwb itself.
 		 */
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
 			zil_free_lwb(zilog, lwb);
 		}
 	} else if (!keep_first) {
 		/* No lwbs in memory: walk the on-disk chain and free it. */
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Free every block and every claimed TX_WRITE data block of the on-disk
  * log by walking the chain with zil_parse().  The zilog must not have
  * any lwbs on its list.  The result of the parse is ignored.
  */
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	uint64_t claim_txg;
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	claim_txg = zilog->zl_header->zh_claim_txg;
 	(void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx,
 	    claim_txg);
 }
 
 /*
  * Claim (or clear) the intent log of dataset 'ds'.
  *
  * dp    - pool the dataset lives in
  * ds    - dataset whose log is processed
  * txarg - the dmu_tx_t the work is done under, passed as void *
  *
  * Takes temporary ownership of the objset, then either clears the log
  * (when the pool's log state is SPA_LOG_CLEAR, or when the uberblock is
  * a checkpointed one and this log was never claimed) or claims all of
  * its blocks and records the claim in the header.  Every visible path
  * returns 0, including failure to own the objset.
  */
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	zilog_t *zilog;
 	uint64_t first_txg;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
 	first_txg = spa_min_claim_txg(zilog->zl_spa);
 
 	/*
 	 * If the spa_log_state is not set to be cleared, check whether
 	 * the current uberblock is a checkpoint one and if the current
 	 * header has been claimed before moving on.
 	 *
 	 * If the current uberblock is a checkpointed uberblock then
 	 * one of the following scenarios took place:
 	 *
 	 * 1] We are currently rewinding to the checkpoint of the pool.
 	 * 2] We crashed in the middle of a checkpoint rewind but we
 	 *    did manage to write the checkpointed uberblock to the
 	 *    vdev labels, so when we tried to import the pool again
 	 *    the checkpointed uberblock was selected from the import
 	 *    procedure.
 	 *
 	 * In both cases we want to zero out all the ZIL blocks, except
 	 * the ones that have been claimed at the time of the checkpoint
 	 * (their zh_claim_txg != 0). The reason is that these blocks
 	 * may be corrupted since we may have reused their locations on
 	 * disk after we took the checkpoint.
 	 *
 	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
 	 * when we first figure out whether the current uberblock is
 	 * checkpointed or not. Unfortunately, that would discard all
 	 * the logs, including the ones that are claimed, and we would
 	 * leak space.
 	 */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
 	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)) {
 		/* Free whatever chain exists, then forget it in the header. */
 		if (!BP_IS_HOLE(&zh->zh_log)) {
 			(void) zil_parse(zilog, zil_clear_log_block,
 			    zil_noop_log_record, tx, first_txg);
 		}
 		BP_ZERO(&zh->zh_log);
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we are not rewinding and opening the pool normally, then
 	 * the min_claim_txg should be equal to the first txg of the pool.
 	 */
 	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		/*
 		 * Replay is flagged only if the parse saw at least one
 		 * record or more than a single block.
 		 */
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  *
  * dp - unused
  * ds - dataset whose log chain is checked
  * tx - must be NULL; nothing is actually claimed
  *
  * Returns 0 if the chain is acceptable (ECKSUM and ENOENT from the
  * parse are mapped to 0), or if the objset cannot be obtained;
  * otherwise the error from zil_parse().
  */
 /* ARGSUSED */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		/*
 		 * Check the first block and determine if it's on a log device
 		 * which may have been removed or faulted prior to loading this
 		 * pool.  If so, there's no point in checking the rest of the
 		 * log as its content should have already been synced to the
 		 * pool.
 		 */
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 
 		/*
 		 * Check whether the current uberblock is checkpointed (e.g.
 		 * we are rewinding) and whether the current header has been
 		 * claimed or not. If it hasn't then skip verifying it. We
 		 * do this because its ZIL blocks may be part of the pool's
 		 * state before the rewind, which is no longer valid.
 		 */
 		zil_header_t *zh = zil_header_in_syncing_context(zilog);
 		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 		    zh->zh_claim_txg == 0)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL :
 	    spa_min_claim_txg(os->os_spa));
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * When an itx is "skipped", this function is used to properly mark the
  * waiter as "done", and signal any thread(s) waiting on it. An itx can
  * be skipped (and not committed to an lwb) for a variety of reasons,
  * one of them being that the itx was committed via spa_sync(), prior to
  * it being committed to an lwb; this can happen if a thread calling
  * zil_commit() is racing with spa_sync().
  */
 static void
 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
 	mutex_enter(&zcw->zcw_lock);
 	/* A waiter must only ever be completed once. */
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when the given waiter is to be linked into an
  * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
  * At this point, the waiter will no longer be referenced by the itx,
  * and instead, will be referenced by the lwb.
  *
  * zcw - waiter to link; must not already be on a list or tied to an lwb
  * lwb - lwb to link it to; must be OPENED, ISSUED or WRITE_DONE
  */
 static void
 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
 {
 	/* Check the pointer before anything below dereferences it. */
 	ASSERT3P(lwb, !=, NULL);
 
 	/*
 	 * The lwb_waiters field of the lwb is protected by the zilog's
 	 * zl_lock, thus it must be held when calling this function.
 	 */
 	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
 	    lwb->lwb_state == LWB_STATE_ISSUED ||
 	    lwb->lwb_state == LWB_STATE_WRITE_DONE);
 
 	list_insert_tail(&lwb->lwb_waiters, zcw);
 	zcw->zcw_lwb = lwb;
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when zio_alloc_zil() fails to allocate a ZIL
  * block, and the given waiter must be linked to the "nolwb waiters"
  * list inside of zil_process_commit_list().
  *
  * zcw   - waiter to link; must not already be on a list or tied to an lwb
  * nolwb - list the waiter is appended to
  */
 static void
 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	/* zcw_lwb stays NULL: the waiter is not associated with any lwb. */
 	list_insert_tail(nolwb, zcw);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * Record, in the lwb's vdev tree, the vdev of every DVA of 'bp', adding
  * a node only for vdevs that are not in the tree yet.  Does nothing when
  * zil_nocacheflush is set.
  */
 void
 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &lwb->lwb_vdev_tree;
 	int ndvas = BP_GET_NDVAS(bp);
 
 	if (zil_nocacheflush)
 		return;
 
 	mutex_enter(&lwb->lwb_vdev_lock);
 	for (int d = 0; d < ndvas; d++) {
 		zil_vdev_node_t key, *node;
 		avl_index_t where;
 
 		key.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
 
 		/* Already tracked; one node per vdev is enough. */
 		if (avl_find(tree, &key, &where) != NULL)
 			continue;
 
 		node = kmem_alloc(sizeof (*node), KM_SLEEP);
 		node->zv_vdev = key.zv_vdev;
 		avl_insert(tree, node, where);
 	}
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
 /*
  * Move every node of lwb's vdev tree into nlwb's vdev tree, leaving
  * lwb's tree empty.  Nodes whose vdev is already present in nlwb's tree
  * are freed instead of moved.
  */
 static void
 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 {
 	avl_tree_t *from = &lwb->lwb_vdev_tree;
 	avl_tree_t *to = &nlwb->lwb_vdev_tree;
 	zil_vdev_node_t *node;
 	void *cookie = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/*
 	 * 'lwb' is at a point in its lifetime where lwb_vdev_tree does
 	 * not need the protection of lwb_vdev_lock (it will only be
 	 * modified while holding zilog->zl_lock), as its writes and those
 	 * of its children have all completed.  The younger 'nlwb', however,
 	 * may be waiting on future writes to additional vdevs, so its lock
 	 * is taken.
 	 */
 	mutex_enter(&nlwb->lwb_vdev_lock);
 
 	/*
 	 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
 	 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
 	 */
 	for (;;) {
 		avl_index_t where;
 
 		node = avl_destroy_nodes(from, &cookie);
 		if (node == NULL)
 			break;
 
 		if (avl_find(to, node, &where) != NULL) {
 			kmem_free(node, sizeof (*node));
 			continue;
 		}
 		avl_insert(to, node, where);
 	}
 
 	mutex_exit(&nlwb->lwb_vdev_lock);
 }
 
 /*
  * Raise the lwb's lwb_max_txg to 'txg' if 'txg' is larger than the value
  * currently recorded.
  */
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
 	if (lwb->lwb_max_txg < txg)
 		lwb->lwb_max_txg = txg;
 }
 
 /*
  * This function is called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
  * as the lwb write completes, if "zil_nocacheflush" is set. Further,
  * all "previous" lwb's will have completed before this function is
  * called; i.e. this function is called for all previous lwbs before
  * it's called for "this" lwb (enforced via the zio dependencies
  * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
  * any sudden loss of power. At this point, any threads waiting for the
  * lwb to reach this state are signalled, and the "waiter" structures
  * are marked "done".
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	/* Saved now because lwb_tx is cleared below, before the commit. */
 	dmu_tx_t *tx = lwb->lwb_tx;
 	zil_commit_waiter_t *zcw;
 
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 
 	mutex_enter(&zilog->zl_lock);
 
 	/*
 	 * Ensure the lwb buffer pointer is cleared before releasing the
 	 * txg. If we have had an allocation failure and the txg is
 	 * waiting to sync then we want zil_sync() to remove the lwb so
 	 * that it's not picked up as the next new one in
 	 * zil_process_commit_list(). zil_sync() will only remove the
 	 * lwb if lwb_buf is null.
 	 */
 	lwb->lwb_buf = NULL;
 	lwb->lwb_tx = NULL;
 
 	/* Record how long this lwb took from issue to completion. */
 	ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
 	zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
 
 	lwb->lwb_root_zio = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
 		 * for ztest. We only update this value when all the log
 		 * writes succeeded, because ztest wants to ASSERT that
 		 * it got the whole log chain.
 		 */
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
 	/*
 	 * Detach every waiter from the lwb, hand it the zio's error, mark
 	 * it done and wake whoever is sleeping on it.
 	 */
 	while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
 		ASSERT(list_link_active(&zcw->zcw_node));
 		list_remove(&lwb->lwb_waiters, zcw);
 
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 
 		zcw->zcw_zio_error = zio->io_error;
 
 		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 		zcw->zcw_done = B_TRUE;
 		cv_broadcast(&zcw->zcw_cv);
 
 		mutex_exit(&zcw->zcw_lock);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * Now that we've written this log block, we have a stable pointer
 	 * to the next block in the chain, so it's OK to let the txg in
 	 * which we allocated the next block sync.
 	 */
 	dmu_tx_commit(tx);
 }
 
 /*
  * This is called when an lwb's write zio completes. The callback's
  * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
  * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
  * in writing out this specific lwb's data, and in the case that cache
  * flushes have been deferred, vdevs involved in writing the data for
  * previous lwbs. The writes corresponding to all the vdevs in the
  * lwb_vdev_tree will have completed by the time this is called, due to
  * the zio dependencies configured in zil_lwb_set_zio_dependency(),
  * which takes deferred flushes into account. The lwb will be "done"
  * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
  * completion callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	/* Sanity-check the shape of the block pointer that was written. */
 	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
 	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
 	ASSERT(!BP_IS_GANG(zio->io_bp));
 	ASSERT(!BP_IS_HOLE(zio->io_bp));
 	ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
 	abd_put(zio->io_abd);
 
 	/*
 	 * Advance the lwb to WRITE_DONE and look up the lwb that follows
 	 * it on the zilog's list, all under zl_lock.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_write_zio = NULL;
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	mutex_exit(&zilog->zl_lock);
 
 	/* No vdevs recorded for this lwb: nothing to flush. */
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/*
 	 * If there was an IO error, we're not going to call zio_flush()
 	 * on these vdevs, so we simply empty the tree and free the
 	 * nodes. We avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 */
 	if (zio->io_error != 0) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
 	}
 
 	/*
 	 * If this lwb does not have any threads waiting for it to
 	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
 	 * command to the vdevs written to by "this" lwb, and instead
 	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
 	 * command for those vdevs. Thus, we merge the vdev tree of
 	 * "this" lwb with the vdev tree of the "next" lwb in the list,
 	 * and assume the "next" lwb will handle flushing the vdevs (or
 	 * deferring the flush(s) again).
 	 *
 	 * This is a useful performance optimization, especially for
 	 * workloads with lots of async write activity and few sync
 	 * write and/or fsync activity, as it has the potential to
 	 * coalesce multiple flush commands to a vdev into one.
 	 */
 	if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
 	}
 
 	/*
 	 * Otherwise issue a flush, as a child of the lwb's root zio, for
 	 * every recorded vdev that can still be looked up, freeing the
 	 * tree nodes as we go.
 	 */
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
 		if (vd != NULL)
 			zio_flush(lwb->lwb_root_zio, vd);
 		kmem_free(zv, sizeof (*zv));
 	}
 }
 
 /*
  * Make the zios of 'lwb' depend on the zios of the most recently opened
  * lwb (zl_last_lwb_opened), unless there is none or it has already
  * reached FLUSH_DONE.  Both zl_issuer_lock and zl_lock must be held.
  */
 static void
 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 
 	/*
 	 * The zilog's "zl_last_lwb_opened" field is used to build the
 	 * lwb/zio dependency chain, which is used to preserve the
 	 * ordering of lwb completions that is required by the semantics
 	 * of the ZIL. Each new lwb zio becomes a parent of the
 	 * "previous" lwb zio, such that the new lwb's zio cannot
 	 * complete until the "previous" lwb's zio completes.
 	 *
 	 * This is required by the semantics of zil_commit(); the commit
 	 * waiters attached to the lwbs will be woken in the lwb zio's
 	 * completion callback, so this zio dependency graph ensures the
 	 * waiters are woken in the correct order (the same order the
 	 * lwbs were created).
 	 */
 	if (last_lwb_opened != NULL &&
 	    last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
 		ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
 		    last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
 		    last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
 
 		ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
 		zio_add_child(lwb->lwb_root_zio,
 		    last_lwb_opened->lwb_root_zio);
 
 		/*
 		 * If the previous lwb's write hasn't already completed,
 		 * we also want to order the completion of the lwb write
 		 * zios (above, we only order the completion of the lwb
 		 * root zios). This is required because of how we can
 		 * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
 		 *
 		 * When the DKIOCFLUSHWRITECACHE commands are deferred,
 		 * the previous lwb will rely on this lwb to flush the
 		 * vdevs written to by that previous lwb. Thus, we need
 		 * to ensure this lwb doesn't issue the flush until
 		 * after the previous lwb's write completes. We ensure
 		 * this ordering by setting the zio parent/child
 		 * relationship here.
 		 *
 		 * Without this relationship on the lwb's write zio,
 		 * it's possible for this lwb's write to complete prior
 		 * to the previous lwb's write completing; and thus, the
 		 * vdevs for the previous lwb would be flushed prior to
 		 * that lwb's data being written to those vdevs (the
 		 * vdevs are flushed in the lwb write zio's completion
 		 * handler, zil_lwb_write_done()).
 		 */
 		if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
 			ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
 			    last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
 
 			ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
 			zio_add_child(lwb->lwb_write_zio,
 			    last_lwb_opened->lwb_write_zio);
 		}
 	}
 }
 
 
 /*
  * "Open" an lwb so that it is ready to have itxs committed to it. This
  * creates the lwb's root zio and write zio, links them to the lwb,
  * chains them behind the previously opened lwb, and records this lwb
  * as the zilog's last opened lwb.
  *
  * The caller must hold zl_issuer_lock. The function is idempotent: for
  * an lwb that has already been opened only the sanity checks run.
  */
 static void
 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
 	EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
 
 	if (lwb->lwb_root_zio == NULL) {
 		zbookmark_phys_t zb;
 		zio_priority_t prio;
 		uint64_t lsize = BP_GET_LSIZE(&lwb->lwb_blk);
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lsize);
 
 		SET_BOOKMARK(&zb,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 		/*
 		 * Only blocks on a slog, and only once more than
 		 * zil_slog_bulk bytes are in use, are demoted to the
 		 * async write priority.
 		 */
 		prio = (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) ?
 		    ZIO_PRIORITY_SYNC_WRITE : ZIO_PRIORITY_ASYNC_WRITE;
 
 		lwb->lwb_root_zio = zio_root(zilog->zl_spa,
 		    zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
 		ASSERT3P(lwb->lwb_root_zio, !=, NULL);
 
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
 		    zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, lsize,
 		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 		ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 
 		lwb->lwb_state = LWB_STATE_OPENED;
 
 		/* The dependency chain is built and published under zl_lock. */
 		mutex_enter(&zilog->zl_lock);
 		zil_lwb_set_zio_dependency(zilog, lwb);
 		zilog->zl_last_lwb_opened = lwb;
 		mutex_exit(&zilog->zl_lock);
 	}
 
 	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
 	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 }
 
 /*
  * The limited set of block sizes the intent log picks from.
  *
  * Every size must be a multiple of 4KB. Only the amount actually used
  * (again aligned to 4KB) gets written, but we still can't simply
  * allocate SPA_OLD_MAXBLOCKSIZE every time, as the slog space could be
  * exhausted. The final UINT64_MAX entry acts as the end-of-table
  * sentinel.
  */
 uint64_t zil_block_buckets[] = {
 	4096,			/* non TX_WRITE */
 	8192 + 4096,		/* data base */
 	(32 * 1024) + 4096,	/* NFS writes */
 	UINT64_MAX
 };
 
 /*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  *
  * The passed in lwb must be in the OPENED state and the caller must
  * hold zl_issuer_lock. On return the lwb is in the ISSUED state and
  * both of its zios have been handed to zio_nowait().
  *
  * Returns the newly allocated "next" lwb, or NULL if zio_alloc_zil()
  * failed to allocate the next on-disk block.
  */
 static lwb_t *
 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	lwb_t *nlwb = NULL;
 	zil_chain_t *zilc;
 	spa_t *spa = zilog->zl_spa;
 	blkptr_t *bp;
 	dmu_tx_t *tx;
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
 	boolean_t slog;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb->lwb_root_zio, !=, NULL);
 	ASSERT3P(lwb->lwb_write_zio, !=, NULL);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 
 	/*
 	 * Locate the chain header, which holds the pointer to the next
 	 * block: at the start of the buffer for ZILOG2 checksummed blocks,
 	 * otherwise at offset lwb_sz into the buffer.
 	 */
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 		bp = &zilc->zc_next_blk;
 	} else {
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
 		bp = &zilc->zc_next_blk;
 	}
 
 	ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
 
 	/*
 	 * Allocate the next block and save its address in this block
 	 * before writing it in order to establish the log chain.
 	 * Note that if the allocation of nlwb synced before we wrote
 	 * the block that points at it (lwb), we'd leak it if we crashed.
 	 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
 	 * We dirty the dataset to ensure that zil_sync() will be called
 	 * to clean up in the event of allocation failure or I/O failure.
 	 */
 
 	tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * Since we are not going to create any new dirty data, and we
 	 * can even help with clearing the existing dirty data, we
 	 * should not be subject to the dirty data based delays. We
 	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
 	 */
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	/* The tx is stashed on the lwb; it is not committed in this function. */
 	lwb->lwb_tx = tx;
 
 	/*
 	 * Log blocks are pre-allocated. Here we select the size of the next
 	 * block, based on size used in the last block.
 	 * - first find the smallest bucket that will fit the block from a
 	 *   limited set of block sizes. This is because it's faster to write
 	 *   blocks allocated from the same metaslab as they are adjacent or
 	 *   close.
 	 * - next find the maximum from the new suggested size and an array of
 	 *   previous sizes. This lessens a picket fence effect of wrongly
 	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
 	 *   requests.
 	 *
 	 * Note we only write what is used, but we can't just allocate
 	 * the maximum block size because we can exhaust the available
 	 * pool log space.
 	 */
 	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
 	/* The UINT64_MAX entry ending zil_block_buckets[] stops this scan. */
 	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
 		continue;
 	zil_blksz = zil_block_buckets[i];
 	if (zil_blksz == UINT64_MAX)
 		zil_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 	for (i = 0; i < ZIL_PREV_BLKS; i++)
 		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
 	/* NOTE(review): the mask assumes ZIL_PREV_BLKS is a power of two. */
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
 
 	/* pass the old blkptr in order to spread log blocks across devs */
 	error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object,
 	    txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		/*
 		 * The next block reuses this block's checksum words with
 		 * the sequence word incremented by one.
 		 */
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		/*
 		 * Allocate a new log write block (lwb).
 		 */
 		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 	}
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		/* For Slim ZIL only write what is used. */
 		wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
 		ASSERT3U(wsz, <=, lwb->lwb_sz);
 		zio_shrink(lwb->lwb_write_zio, wsz);
 
 	} else {
 		wsz = lwb->lwb_sz;
 	}
 
 	zilc->zc_pad = 0;
 	zilc->zc_nused = lwb->lwb_nused;
 	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 
 	/*
 	 * clear unused data for security
 	 */
 	bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
 
 	/*
 	 * SCL_STATE is entered here with the lwb as the tag and is not
 	 * exited in this function; presumably the lwb's completion path
 	 * drops it -- TODO confirm.
 	 */
 	spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
 
 	zil_lwb_add_block(lwb, &lwb->lwb_blk);
 	lwb->lwb_issued_timestamp = gethrtime();
 	lwb->lwb_state = LWB_STATE_ISSUED;
 
 	zio_nowait(lwb->lwb_root_zio);
 	zio_nowait(lwb->lwb_write_zio);
 
 	/*
 	 * If there was an allocation failure then nlwb will be null which
 	 * forces a txg_wait_synced().
 	 */
 	return (nlwb);
 }
 
 /*
  * Commit a single itx to the given lwb, opening the lwb first if needed.
  *
  * A TX_COMMIT itx carries no log record; its waiter is simply linked to
  * the lwb. For any other itx the log record is copied into the lwb's
  * buffer, and for a WR_NEED_COPY write the data is copied in after it,
  * split across as many lwbs as it takes. Whenever the record does not
  * fit, the current lwb is issued via zil_lwb_write_issue() and the
  * lwb it returns is used instead.
  *
  * Returns the lwb that subsequent itxs should be committed to, or NULL
  * if zil_lwb_write_issue() could not provide a next lwb. The itx itself
  * is not freed here. The caller must hold zl_issuer_lock.
  */
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
 	lr_t *lrcb, *lrc;
 	lr_write_t *lrwb, *lrw;
 	char *lr_buf;
 	uint64_t dlen, dnow, lwb_sp, reclen, txg;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
 	/* lrc/lrw both alias the record embedded in the itx. */
 	lrc = &itx->itx_lr;
 	lrw = (lr_write_t *)lrc;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
 	 * provides a mechanism for attaching a "commit waiter" onto the
 	 * correct lwb (such that the waiter can be signalled upon
 	 * completion of that lwb). Thus, we don't process this itx's
 	 * log record if it's a commit itx (these itx's don't have log
 	 * records), and instead link the itx's waiter onto the lwb's
 	 * list of waiters.
 	 *
 	 * For more details, see the comment above zil_commit().
 	 */
 	if (lrc->lrc_txtype == TX_COMMIT) {
 		mutex_enter(&zilog->zl_lock);
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		itx->itx_private = NULL;
 		mutex_exit(&zilog->zl_lock);
 		return (lwb);
 	}
 
 	/*
 	 * dlen is the number of data bytes that still have to be copied
 	 * into the log after the record; only WR_NEED_COPY writes have any.
 	 */
 	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
 	} else {
 		dlen = 0;
 	}
 	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
 	txg = lrc->lrc_txg;
 
 	ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
 	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
 		lwb = zil_lwb_write_issue(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_open(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
 		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
 		ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 	}
 
 	/* dnow is how much of the remaining data fits in this lwb. */
 	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
 	lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
 	lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lrc->lrc_txtype == TX_WRITE) {
 		if (txg > spa_freeze_txg(zilog->zl_spa))
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 		if (itx->itx_wr_state != WR_COPIED) {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				/*
 				 * The copy inside the lwb describes only
 				 * this chunk, while the itx's own record
 				 * is advanced past it so that the next
 				 * pass through "cont" resumes where this
 				 * chunk ends.
 				 */
 				dbuf = lr_buf + reclen;
 				lrcb->lrc_reclen += dnow;
 				if (lrwb->lr_length > dnow)
 					lrwb->lr_length = dnow;
 				lrw->lr_offset += dnow;
 				lrw->lr_length -= dnow;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 
 			/*
 			 * We pass in the "lwb_write_zio" rather than
 			 * "lwb_root_zio" so that the "lwb_write_zio"
 			 * becomes the parent of any zio's created by
 			 * the "zl_get_data" callback. The vdevs are
 			 * flushed after the "lwb_write_zio" completes,
 			 * so we want to make sure that completion
 			 * callback waits for these additional zio's,
 			 * such that the vdevs used by those zio's will
 			 * be included in the lwb's vdev tree, and those
 			 * vdevs will be properly flushed. If we passed
 			 * in "lwb_root_zio" here, then these additional
 			 * vdevs may not be flushed; e.g. if these zio's
 			 * completed after "lwb_write_zio" completed.
 			 */
 			error = zilog->zl_get_data(itx->itx_private,
 			    lrwb, dbuf, lwb, lwb->lwb_write_zio);
 
 			/*
 			 * On any error we return before lwb_nused is
 			 * advanced, so the record copied above does not
 			 * become part of the lwb. For EIO we first wait
 			 * for the itx's txg to sync.
 			 */
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
 			}
 			if (error != 0) {
 				ASSERT(error == ENOENT || error == EEXIST ||
 				    error == EALREADY);
 				return (lwb);
 			}
 		}
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	lrcb->lrc_seq = ++zilog->zl_lr_seq;
 	lwb->lwb_nused += reclen + dnow;
 
 	zil_lwb_add_txg(lwb, txg);
 
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	/*
 	 * If data remains, loop to write another chunk; each additional
 	 * chunk carries its own copy of the record, hence the extra reclen.
 	 */
 	dlen -= dnow;
 	if (dlen > 0) {
 		zilog->zl_cur_used += reclen;
 		goto cont;
 	}
 
 	return (lwb);
 }
 
 itx_t *
 zil_itx_create(uint64_t txtype, size_t lrsize)
 {
 	itx_t *itx;
 
 	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
 
 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 
 	return (itx);
 }
 
 /*
  * Free an itx. The size handed to kmem_free() is rebuilt from the
  * record length stored in the itx, using the same formula that
  * zil_itx_create() allocates with.
  */
 void
 zil_itx_destroy(itx_t *itx)
 {
 	size_t itx_size = offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen;
 
 	kmem_free(itx, itx_size);
 }
 
 /*
  * Release everything held by an itxs_t: the itxs on its sync list, the
  * itxs on every per-object async list together with the tree nodes
  * that hold those lists, and finally the itxs_t itself. The itxs_t has
  * already been detached, so no locks are needed.
  */
 static void
 zil_itxg_clean(itxs_t *itxs)
 {
 	list_t *sync_list = &itxs->i_sync_list;
 	avl_tree_t *async_tree = &itxs->i_async_tree;
 	itx_async_node_t *node;
 	void *cookie = NULL;
 	itx_t *itx;
 
 	for (itx = list_head(sync_list); itx != NULL;
 	    itx = list_head(sync_list)) {
 		/*
 		 * Commit itxs are normally consumed by the commit path
 		 * and so are not expected here. They can still show up
 		 * because of the following race:
 		 *
 		 *	- a thread calls zil_commit() which assigns the
 		 *	  commit itx to a per-txg i_sync_list
 		 *	- zil_itxg_clean() is called (e.g. via spa_sync())
 		 *	  while the waiter is still on the i_sync_list
 		 *
 		 * Nothing prevents the txg from syncing while the
 		 * waiter is on the i_sync_list. It is uncommon because
 		 * spa_sync() is slower than zil_commit(), but if
 		 * zil_commit() calls txg_wait_synced() (e.g. because
 		 * zil_create() or zil_commit_writer_stall() is called)
 		 * we will hit this case, and the waiter has to be
 		 * marked done here.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
 		list_remove(sync_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	while ((node = avl_destroy_nodes(async_tree, &cookie)) != NULL) {
 		list_t *async_list = &node->ia_list;
 
 		for (itx = list_head(async_list); itx != NULL;
 		    itx = list_head(async_list)) {
 			list_remove(async_list, itx);
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(async_list);
 		kmem_free(node, sizeof (itx_async_node_t));
 	}
 	avl_destroy(async_tree);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (AVL_CMP(o1, o2));
 }
 
 /*
  * Remove, and free, all async itxs recorded for object "oid" in every
  * itx group that may still be outstanding. The itxs are unhooked from
  * their groups under each group's itxg_lock and destroyed afterwards,
  * with no lock held. "oid" must not be 0.
  */
 static void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	uint64_t first_txg, end_txg, txg;
 	list_t doomed;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&doomed, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	/* ziltest support: use ZILTEST_TXG when the pool is frozen */
 	first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 	end_txg = first_txg + TXG_CONCURRENT_STATES;
 
 	for (txg = first_txg; txg < end_txg; txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			avl_tree_t *tree = &itxg->itxg_itxs->i_async_tree;
 			itx_async_node_t *node;
 			avl_index_t where;
 
 			/*
 			 * Locate the object's node; if there is one, take
 			 * its whole list. The now empty node stays in the
 			 * tree.
 			 */
 			node = avl_find(tree, &oid, &where);
 			if (node != NULL)
 				list_move_tail(&doomed, &node->ia_list);
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 
 	for (itx = list_head(&doomed); itx != NULL;
 	    itx = list_head(&doomed)) {
 		list_remove(&doomed, itx);
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&doomed);
 }
 
 /*
  * Queue an itx on the zilog's in-memory itx group for the txg that "tx"
  * belongs to.
  *
  * A sync itx is appended to the group's i_sync_list. An async itx is
  * appended to the list of the i_async_tree node keyed by the object id
  * taken from the record's lr_foid; the node is created on first use.
  * The record's lrc_txg is set to the txg of "tx" and the zilog is
  * marked dirty in that txg.
  *
  * When the pool is frozen (ziltest) the itx is filed under ZILTEST_TXG
  * instead of the real txg. If the slot still holds itxs of an older
  * txg, those are detached here and freed once the lock is dropped.
  */
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Object ids can be re-instantiated in the next txg so
 	 * remove any async transactions to avoid future leaks.
 	 * This can happen if a fsync occurs on the re-instantiated
 	 * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
 	 * the new file data and flushes a write record for the old object.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE)
 		zil_remove_async(zilog, itx->itx_oid);
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 *
 			 * The txg is a uint64_t, so it is cast to match the
 			 * "%llu" conversion in the message.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", (unsigned long long)itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian;
 		avl_index_t where;
 
 		/* Find the object's node, creating it if this is its first itx. */
 		ian = avl_find(t, &foid, &where);
 		if (ian == NULL) {
 			ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
 	 * need to be careful to always dirty the ZIL using the "real"
 	 * TXG (not itxg_txg) even when the SPA is frozen.
 	 */
 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after we
  * have written out the uberblocks (i.e. txg has been committed) so that we
  * don't inadvertently clean out in-memory log records that would be required
  * by zil_commit().
  *
  * The itxs filed in the slot for "synced_txg" are detached under the
  * slot's itxg_lock and then released with zil_itxg_clean(), either from
  * the pool's dp_zil_clean_taskq or, if the dispatch fails, in-line.
  * Nothing is done when the slot is empty or holds ZILTEST_TXG itxs.
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *clean_me;
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
 	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
 		mutex_exit(&itxg->itxg_lock);
 		return;
 	}
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT3U(itxg->itxg_txg, !=, 0);
 	/* Detach the itxs and reset the slot while the lock is held. */
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
 	mutex_exit(&itxg->itxg_lock);
 	/*
 	 * Preferably start a task queue to free up the old itxs but
 	 * if taskq_dispatch can't allocate resources to do that then
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
-	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == NULL)
+	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) ==
+	    TASKQID_INVALID)
 		zil_itxg_clean(clean_me);
 }
 
 /*
  * Walk the itx groups that may still hold itxs needing to be committed
  * and move each group's sync itxs onto the tail of the ZIL's
  * zl_itx_commit_list. The caller must hold zl_issuer_lock.
  */
 static void
 zil_get_commit_list(zilog_t *zilog)
 {
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 	uint64_t first_txg, end_txg, txg;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/* ziltest support: use ZILTEST_TXG when the pool is frozen */
 	first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 	end_txg = first_txg + TXG_CONCURRENT_STATES;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = first_txg; txg < end_txg; txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			/*
 			 * If we're adding itx records to the
 			 * zl_itx_commit_list, then the zil better be dirty
 			 * in this "txg". We can assert that here since
 			 * we're holding the itxg_lock which will prevent
 			 * spa_sync from cleaning it. Once we add the itxs
 			 * to the zl_itx_commit_list we must commit it to
 			 * disk even if it's unnecessary (i.e. the txg was
 			 * synced).
 			 */
 			ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 			    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 			list_move_tail(commit_list,
 			    &itxg->itxg_itxs->i_sync_list);
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * Move async itxs onto the sync list of their itx group, for every
  * group that may still be outstanding. With a non-zero "foid" only the
  * itxs of that object are moved; with a "foid" of 0 the itxs of all
  * objects are moved and the async tree nodes are freed.
  */
 static void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	uint64_t first_txg, end_txg, txg;
 
 	/* ziltest support: use ZILTEST_TXG when the pool is frozen */
 	first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 	end_txg = first_txg + TXG_CONCURRENT_STATES;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = first_txg; txg < end_txg; txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 			avl_tree_t *tree = &itxg->itxg_itxs->i_async_tree;
 			itx_async_node_t *node;
 
 			/*
 			 * Lists are appended to the end of the sync list
 			 * rather than the beginning, to ensure the create
 			 * has happened.
 			 */
 			if (foid == 0) {
 				void *cookie = NULL;
 
 				/* Every object: drain and free each node. */
 				while ((node = avl_destroy_nodes(tree,
 				    &cookie)) != NULL) {
 					list_move_tail(sync_list,
 					    &node->ia_list);
 					list_destroy(&node->ia_list);
 					kmem_free(node,
 					    sizeof (itx_async_node_t));
 				}
 			} else {
 				avl_index_t where;
 
 				/* One object: take its list if it has one. */
 				node = avl_find(tree, &foid, &where);
 				if (node != NULL) {
 					list_move_tail(sync_list,
 					    &node->ia_list);
 				}
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * This function will prune commit itxs that are at the head of the
  * commit list (it won't prune past the first non-commit itx), and
  * either: a) attach them to the last lwb that's still pending
  * completion, or b) skip them altogether.
  *
  * This is used as a performance optimization to prevent commit itxs
  * from generating new lwbs when it's unnecessary to do so.
  *
  * Each pruned itx is removed from the commit list and destroyed. The
  * caller must hold zl_issuer_lock.
  */
 static void
 zil_prune_commit_list(zilog_t *zilog)
 {
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		if (lrc->lrc_txtype != TX_COMMIT)
 			break;
 
 		/* zl_last_lwb_opened and its state are examined under zl_lock. */
 		mutex_enter(&zilog->zl_lock);
 
 		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
 		if (last_lwb == NULL ||
 		    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
 			/*
 			 * All of the itxs this waiter was waiting on
 			 * must have already completed (or there were
 			 * never any itx's for it to wait on), so it's
 			 * safe to skip this waiter and mark it done.
 			 */
 			zil_commit_waiter_skip(itx->itx_private);
 		} else {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 			itx->itx_private = NULL;
 		}
 
 		mutex_exit(&zilog->zl_lock);
 
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	/* Either the list is now empty or its head is a non-commit itx. */
 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 }
 
 /*
  * Stall the ZIL write pipeline until every lwb on the zilog's
  * zl_lwb_list has been synced and freed.
  *
  * This is needed when zio_alloc_zil() fails to allocate the next lwb
  * block on disk. Waiting in txg_wait_synced() ensures all of the lwbs
  * in zl_lwb_list are synced and then freed (in zil_sync()), so that
  * any subsequent ZIL writer (i.e. a call to zil_process_commit_list())
  * will have to call zil_create() and start a new ZIL chain.
  *
  * Because zio_alloc_zil() failed, the lwb that was previously issued
  * has no pointer to the "next" lwb on disk. If another ZIL writer
  * thread were to allocate the "next" on-disk lwb, that block could be
  * leaked in the event of a crash, since the previous on-disk lwb would
  * not point to it.
  *
  * The zilog's zl_issuer_lock must be held across this call, so that no
  * new thread enters zil_process_commit_list() until all lwbs in the
  * zl_lwb_list have been synced and freed.
  */
 static void
 zil_commit_writer_stall(zilog_t *zilog)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	/* No lwb may be left on the list once the wait returns. */
 	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
 }
 
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
  * created lwbs. Additionally, as a new lwb is created, the previous
  * lwb will be issued to the zio layer to be written to disk.
  *
  * Every itx taken from the commit list is destroyed here, whether or
  * not it was committed to an lwb. The caller must hold zl_issuer_lock.
  */
 static void
 zil_process_commit_list(zilog_t *zilog)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_waiters;
 	lwb_t *lwb;
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
 	if (list_head(&zilog->zl_itx_commit_list) == NULL)
 		return;
 
 	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
 		lwb = zil_create(zilog);
 	} else {
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 	}
 
 	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
 		ASSERT3U(txg, !=, 0);
 
 		if (lrc->lrc_txtype == TX_COMMIT) {
 			DTRACE_PROBE2(zil__process__commit__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		} else {
 			DTRACE_PROBE2(zil__process__normal__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
 		/*
 		 * If the txg of this itx has already been synced out, then
 		 * we don't need to commit this itx to an lwb. This is
 		 * because the data of this itx will have already been
 		 * written to the main pool. This is inherently racy, and
 		 * it's still ok to commit an itx whose txg has already
 		 * been synced; this will result in a write that's
 		 * unnecessary, but will do no harm.
 		 *
 		 * With that said, we always want to commit TX_COMMIT itxs
 		 * to an lwb, regardless of whether or not that itx's txg
 		 * has been synced out. We do this to ensure any OPENED lwb
 		 * will always have at least one zil_commit_waiter_t linked
 		 * to the lwb.
 		 *
 		 * As a counter-example, if we skipped TX_COMMIT itx's
 		 * whose txg had already been synced, the following
 		 * situation could occur if we happened to be racing with
 		 * spa_sync:
 		 *
 		 * 1. we commit a non-TX_COMMIT itx to an lwb, where the
 		 *    itx's txg is 10 and the last synced txg is 9.
 		 * 2. spa_sync finishes syncing out txg 10.
 		 * 3. we move to the next itx in the list, it's a TX_COMMIT
 		 *    whose txg is 10, so we skip it rather than committing
 		 *    it to the lwb used in (1).
 		 *
 		 * If the itx that is skipped in (3) is the last TX_COMMIT
 		 * itx in the commit list, then it's possible for the lwb
 		 * used in (1) to remain in the OPENED state indefinitely.
 		 *
 		 * To prevent the above scenario from occurring, ensuring
 		 * that once an lwb is OPENED it will transition to ISSUED
 		 * and eventually DONE, we always commit TX_COMMIT itx's to
 		 * an lwb here, even if that itx's txg has already been
 		 * synced.
 		 *
 		 * Finally, if the pool is frozen, we _always_ commit the
 		 * itx.  The point of freezing the pool is to prevent data
 		 * from being written to the main pool via spa_sync, and
 		 * instead rely solely on the ZIL to persistently store the
 		 * data; i.e.  when the pool is frozen, the last synced txg
 		 * value can't be trusted.
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_commit(zilog, itx, lwb);
 			} else if (lrc->lrc_txtype == TX_COMMIT) {
 				/*
 				 * There is no lwb to attach the waiter to;
 				 * park it so it can be marked done below.
 				 */
 				ASSERT3P(lwb, ==, NULL);
 				zil_commit_waiter_link_nolwb(
 				    itx->itx_private, &nolwb_waiters);
 			}
 		}
 
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	if (lwb == NULL) {
 		/*
 		 * This indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
 		 */
 		zil_commit_writer_stall(zilog);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
 		 * waiters as "done" here, since without an lwb, we
 		 * can't do this via zil_lwb_flush_vdevs_done() like
 		 * normal.
 		 */
 		zil_commit_waiter_t *zcw;
 		while ((zcw = list_head(&nolwb_waiters)) != NULL) {
 			zil_commit_waiter_skip(zcw);
 			list_remove(&nolwb_waiters, zcw);
 		}
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 		/*
 		 * At this point, the ZIL block pointed at by the "lwb"
 		 * variable is in one of the following states: "closed"
 		 * or "open".
 		 *
 		 * If it's "closed", then no itxs have been committed to
 		 * it, so there's no point in issuing its zio (i.e.
 		 * it's "empty").
 		 *
 		 * If it's in the "open" state, then it contains one or
 		 * more itxs that eventually need to be committed to
 		 * stable storage. In this case we intentionally do not
 		 * issue the lwb's zio to disk yet, and instead rely on
 		 * one of the following two mechanisms for issuing the
 		 * zio:
 		 *
 		 * 1. Ideally, there will be more ZIL activity occurring
 		 * on the system, such that this function will be
 		 * immediately called again (not necessarily by the same
 		 * thread) and this lwb's zio will be issued via
 		 * zil_lwb_commit(). This way, the lwb is guaranteed to
 		 * be "full" when it is issued to disk, and we'll make
 		 * use of the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, such that this lwb's zio isn't issued via
 		 * zil_lwb_commit(), zil_commit_waiter() will issue the
 		 * lwb's zio. If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
 		 * of ZIL activity occurring on the system at that time.
 		 *
 		 * We do this for a couple of reasons:
 		 *
 		 * 1. To try and reduce the number of IOPs needed to
 		 * write the same number of itxs. If an lwb has space
 		 * available in its buffer for more itxs, and more itxs
 		 * will be committed relatively soon (relative to the
 		 * latency of performing a write), then it's beneficial
 		 * to wait for these "next" itxs. This way, more itxs
 		 * can be committed to stable storage with fewer writes.
 		 *
 		 * 2. To try and use the largest lwb block size that the
 		 * incoming rate of itxs can support. Again, this is to
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
 	}
 }
 
 /*
  * This function is responsible for ensuring the passed in commit waiter
  * (and associated commit itx) is committed to an lwb. If the waiter is
  * not already committed to an lwb, all itxs in the zilog's queue of
  * itxs will be processed. The assumption is the passed in waiter's
  * commit itx will be found in the queue just like the other non-commit
  * itxs, such that when the entire queue is processed, the waiter will
  * have been committed to an lwb.
  *
  * The lwb associated with the passed in waiter is not guaranteed to
  * have been issued by the time this function completes. If the lwb is
  * not issued, we rely on future calls to zil_commit_writer() to issue
  * the lwb, or the timeout mechanism found in zil_commit_waiter().
  */
 static void
 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	/*
 	 * The itx queue only needs to be processed when this waiter has
 	 * neither been committed to an lwb nor marked done. Another
 	 * thread may have taken care of the waiter while we were blocked
 	 * acquiring the "zl_issuer_lock"; in that case we skip all queue
 	 * processing and release the lock right away.
 	 *
 	 * The "zl_issuer_lock" can become highly contended on certain
 	 * workloads and system configurations, and getting out of the
 	 * way quickly helps. This optimization was measured to reduce
 	 * the CPU spent contending on this lock by up to 5%, using a
 	 * system with 32 CPUs, low latency storage (~50 usec writes),
 	 * and 1024 threads performing sync writes.
 	 */
 	if (zcw->zcw_lwb == NULL && !zcw->zcw_done) {
 		zil_get_commit_list(zilog);
 		zil_prune_commit_list(zilog);
 		zil_process_commit_list(zilog);
 	}
 
 	mutex_exit(&zilog->zl_issuer_lock);
 }
 
 /*
  * Issue the lwb that the given commit waiter is attached to, if no
  * other thread has issued it yet. This is called by zil_commit_waiter()
  * after its timed wait on an OPENED lwb returns without the waiter
  * having been marked "done".
  *
  * The caller must hold the waiter's "zcw_lock" and must not hold the
  * zilog's "zl_issuer_lock". The "zcw_lock" is dropped and re-acquired
  * inside this function, so the waiter may have been marked "done" by
  * the time it returns; the "zcw_lock" is always held again on return.
  */
 static void
 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 
 	lwb_t *lwb = zcw->zcw_lwb;
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
 
 	/*
 	 * If the lwb has already been issued by another thread, we can
 	 * immediately return since there's no work to be done (the
 	 * point of this function is to issue the lwb). Additionally, we
 	 * do this prior to acquiring the zl_issuer_lock, to avoid
 	 * acquiring it when it's not necessary to do so.
 	 */
 	if (lwb->lwb_state == LWB_STATE_ISSUED ||
 	    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		return;
 
 	/*
 	 * In order to call zil_lwb_write_issue() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
 	 * elsewhere.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 	mutex_enter(&zilog->zl_issuer_lock);
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * Since we just dropped and re-acquired the commit waiter's
 	 * lock, we have to re-check to see if the waiter was marked
 	 * "done" during that process. If the waiter was marked "done",
 	 * the "lwb" pointer is no longer valid (it can be free'd after
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
 	if (zcw->zcw_done)
 		goto out;
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
 	/*
 	 * We've already checked this above, but since we hadn't acquired
 	 * the zilog's zl_issuer_lock, we have to perform this check a
 	 * second time while holding the lock.
 	 *
 	 * We don't need to hold the zl_lock since the lwb cannot transition
 	 * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
 	 * _can_ transition from ISSUED to DONE, but it's OK to race with
 	 * that transition since we treat the lwb the same, whether it's in
 	 * the ISSUED or DONE states.
 	 *
 	 * The important thing, is we treat the lwb differently depending on
 	 * if it's ISSUED or OPENED, and block any other threads that might
 	 * attempt to issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
 	 * zil_lwb_write_issue() if the lwb had already been issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state == LWB_STATE_ISSUED ||
 	    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		goto out;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
 	 * zil_process_commit_list(), we need to issue this lwb's zio
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
 	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
 
 	/* A non-NULL "next" lwb implies the current lwb left the OPENED state. */
 	IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
 
 	/*
 	 * Since the lwb's zio hadn't been issued by the time this thread
 	 * reached its timeout, we reset the zilog's "zl_cur_used" field
 	 * to influence the zil block size selection algorithm.
 	 *
 	 * By having to issue the lwb's zio here, it means the size of the
 	 * lwb was too large, given the incoming throughput of itxs.  By
 	 * setting "zl_cur_used" to zero, we communicate this fact to the
 	 * block size selection algorithm, so it can take this information
 	 * into account, and potentially select a smaller size for the
 	 * next lwb block that is allocated.
 	 */
 	zilog->zl_cur_used = 0;
 
 	if (nlwb == NULL) {
 		/*
 		 * When zil_lwb_write_issue() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
 		 *
 		 * We must drop the commit waiter's lock prior to
 		 * calling zil_commit_writer_stall() or else we can wind
 		 * up with the following deadlock:
 		 *
 		 * - This thread is waiting for the txg to sync while
 		 *   holding the waiter's lock; txg_wait_synced() is
 		 *   used within zil_commit_writer_stall().
 		 *
 		 * - The txg can't sync because it is waiting for this
 		 *   lwb's zio callback to call dmu_tx_commit().
 		 *
 		 * - The lwb's zio callback can't call dmu_tx_commit()
 		 *   because it's blocked trying to acquire the waiter's
 		 *   lock, which occurs prior to calling dmu_tx_commit()
 		 */
 		mutex_exit(&zcw->zcw_lock);
 		zil_commit_writer_stall(zilog);
 		mutex_enter(&zcw->zcw_lock);
 	}
 
 out:
 	/* Every path reaching "out" holds both locks; only zcw_lock is kept. */
 	mutex_exit(&zilog->zl_issuer_lock);
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 }
 
 /*
  * This function is responsible for performing the following two tasks:
  *
  * 1. its primary responsibility is to block until the given "commit
  *    waiter" is considered "done".
  *
  * 2. its secondary responsibility is to issue the zio for the lwb that
  *    the given "commit waiter" is waiting on, if this function has
  *    waited "long enough" and the lwb is still in the "open" state.
  *
  * Given a sufficient amount of itxs being generated and written using
  * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
  * function. If this does not occur, this secondary responsibility will
  * ensure the lwb is issued even if there is no other synchronous
  * activity on the system.
  *
  * For more details, see zil_process_commit_list(); more specifically,
  * the comment at the bottom of that function.
  */
 static void
 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * The timeout is scaled based on the lwb latency to avoid
 	 * significantly impacting the latency of each individual itx.
 	 * For more details, see the comment at the bottom of the
 	 * zil_process_commit_list() function.
 	 *
 	 * The percentage is clamped to at least 1, and the absolute
 	 * wakeup time is computed once, before the loop, so repeated
 	 * wakeups do not extend the overall deadline.
 	 */
 	int pct = MAX(zfs_commit_timeout_pct, 1);
 	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
 	hrtime_t wakeup = gethrtime() + sleep;
 	boolean_t timedout = B_FALSE;
 
 	while (!zcw->zcw_done) {
 		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 
 		lwb_t *lwb = zcw->zcw_lwb;
 
 		/*
 		 * Usually, the waiter will have a non-NULL lwb field here,
 		 * but it's possible for it to be NULL as a result of
 		 * zil_commit() racing with spa_sync().
 		 *
 		 * When zil_clean() is called, it's possible for the itxg
 		 * list (which may be cleaned via a taskq) to contain
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
 		 * marked done until zil_commit_waiter_skip() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
 		 * "zcw" variable) to be found in this "in between" state;
 		 * where its "zcw_lwb" field is NULL, and it hasn't yet
 		 * been skipped, so its "zcw_done" field is still B_FALSE.
 		 */
 		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
 
 		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
 			/*
 			 * We only ever time out once: after the timeout
 			 * path below runs, the lwb is asserted to no longer
 			 * be OPENED (unless the waiter is done), so this
 			 * branch is not expected to be entered again.
 			 */
 			ASSERT3B(timedout, ==, B_FALSE);
 
 			/*
 			 * If the lwb hasn't been issued yet, then we
 			 * need to wait with a timeout, in case this
 			 * function needs to issue the lwb after the
 			 * timeout is reached; responsibility (2) from
 			 * the comment above this function.
 			 */
 			clock_t timeleft = cv_timedwait_hires(&zcw->zcw_cv,
 			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
 			    CALLOUT_FLAG_ABSOLUTE);
 
 			/*
 			 * A non-negative return, or a waiter that is now
 			 * done, means there is nothing to issue here; go
 			 * back and re-evaluate the loop condition.
 			 * NOTE(review): a negative return is taken to mean
 			 * the wakeup time passed without a signal -- confirm
 			 * against the cv_timedwait_hires() documentation.
 			 */
 			if (timeleft >= 0 || zcw->zcw_done)
 				continue;
 
 			timedout = B_TRUE;
 			zil_commit_waiter_timeout(zilog, zcw);
 
 			if (!zcw->zcw_done) {
 				/*
 				 * If the commit waiter has already been
 				 * marked "done", it's possible for the
 				 * waiter's lwb structure to have already
 				 * been freed.  Thus, we can only reliably
 				 * make these assertions if the waiter
 				 * isn't done.
 				 */
 				ASSERT3P(lwb, ==, zcw->zcw_lwb);
 				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 			}
 		} else {
 			/*
 			 * If the lwb isn't open, then it must have already
 			 * been issued. In that case, there's no need to
 			 * use a timeout when waiting for the lwb to
 			 * complete.
 			 *
 			 * Additionally, if the lwb is NULL, the waiter
 			 * will soon be signalled and marked done via
 			 * zil_clean() and zil_itxg_clean(), so no timeout
 			 * is required.
 			 */
 
 			IMPLY(lwb != NULL,
 			    lwb->lwb_state == LWB_STATE_ISSUED ||
 			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 		}
 	}
 
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * Allocate a commit waiter from "zil_zcw_cache" and initialize it to
  * the "not yet committed" state: no lwb, not done, no zio error, and
  * not linked onto any list. The allocation uses KM_SLEEP.
  *
  * The waiter is released with zil_free_commit_waiter().
  */
 static zil_commit_waiter_t *
 zil_alloc_commit_waiter(void)
 {
 	zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
 
 	/*
 	 * The cache is created without a constructor, so every field
 	 * must be initialized here on each allocation.
 	 */
 	cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_link_init(&zcw->zcw_node);
 	zcw->zcw_lwb = NULL;
 	zcw->zcw_done = B_FALSE;
 	zcw->zcw_zio_error = 0;
 
 	return (zcw);
 }
 
 /*
  * Tear down a commit waiter and hand it back to "zil_zcw_cache".
  *
  * The waiter must already be finished with: marked done, detached
  * from its lwb, and unlinked from any lwb waiter list.
  */
 static void
 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
 
 	/* Release the synchronization objects, then the memory itself. */
 	cv_destroy(&zcw->zcw_cv);
 	mutex_destroy(&zcw->zcw_lock);
 
 	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
  * Create a TX_COMMIT itx that carries the given commit waiter, and
  * assign it to the zilog. Once assigned, the itx sits in the ZIL's
  * list of synchronous itxs, where zil_process_commit_list() will
  * later either commit it to an lwb or skip it.
  */
 static void
 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	dmu_tx_t *tx;
 	itx_t *commit_itx;
 
 	/* The assignment is verified to succeed (VERIFY0). */
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 
 	/*
 	 * The commit itx is a bare lr_t marked synchronous; the waiter
 	 * rides along in the itx's private field.
 	 */
 	commit_itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
 	commit_itx->itx_sync = B_TRUE;
 	commit_itx->itx_private = zcw;
 
 	zil_itx_assign(zilog, commit_itx, tx);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Commit ZFS Intent Log transactions (itxs) to stable storage.
  *
  * When writing ZIL transactions to the on-disk representation of the
  * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
  * itxs can be committed to a single lwb. Once a lwb is written and
  * committed to stable storage (i.e. the lwb is written, and vdevs have
  * been flushed), each itx that was committed to that lwb is also
  * considered to be committed to stable storage.
  *
  * When an itx is committed to an lwb, the log record (lr_t) contained
  * by the itx is copied into the lwb's zio buffer, and once this buffer
  * is written to disk, it becomes an on-disk ZIL block.
  *
  * As itxs are generated, they're inserted into the ZIL's queue of
  * uncommitted itxs. The semantics of zil_commit() are such that it will
  * block until all itxs that were in the queue when it was called, are
  * committed to stable storage.
  *
  * If "foid" is zero, this means all "synchronous" and "asynchronous"
  * itxs, for all objects in the dataset, will be committed to stable
  * storage prior to zil_commit() returning. If "foid" is non-zero, all
  * "synchronous" itxs for all objects, but only "asynchronous" itxs
  * that correspond to the foid passed in, will be committed to stable
  * storage prior to zil_commit() returning.
  *
  * Generally speaking, when zil_commit() is called, the consumer doesn't
  * actually care about _all_ of the uncommitted itxs. Instead, they're
  * simply waiting for a specific itx to be committed to disk,
  * but the interface(s) for interacting with the ZIL don't allow such
  * fine-grained communication. A better interface would allow a consumer
  * to create and assign an itx, and then pass a reference to this itx to
  * zil_commit(); such that zil_commit() would return as soon as that
  * specific itx was committed to disk (instead of waiting for _all_
  * itxs to be committed).
  *
  * When a thread calls zil_commit() a special "commit itx" will be
  * generated, along with a corresponding "waiter" for this commit itx.
  * zil_commit() will wait on this waiter's CV, such that when the waiter
  * is marked done, and signalled, zil_commit() will return.
  *
  * This commit itx is inserted into the queue of uncommitted itxs. This
  * provides an easy mechanism for determining which itxs were in the
  * queue prior to zil_commit() having been called, and which itxs were
  * added after zil_commit() was called.
  *
  * The commit itx is special; it doesn't have any on-disk representation.
  * When a commit itx is "committed" to an lwb, the waiter associated
  * with it is linked onto the lwb's list of waiters. Then, when that lwb
  * completes, each waiter on the lwb's list is marked done and signalled
  * -- allowing the thread waiting on the waiter to return from zil_commit().
  *
  * It's important to point out a few critical factors that allow us
  * to make use of the commit itxs, commit waiters, per-lwb lists of
  * commit waiters, and zio completion callbacks like we're doing:
  *
  *   1. The list of waiters for each lwb is traversed, and each commit
  *      waiter is marked "done" and signalled, in the zio completion
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signalled in the zio completion
  *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
  *        that are sent to the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
  *      itxs are added to the queue, they are added to the tail of
  *      in-memory linked lists.
  *
  *      When committing the itxs to lwbs (to be written to disk), they
  *      are committed in the same order in which the itxs were added to
  *      the uncommitted queue's linked list(s); i.e. the linked list of
  *      itxs to commit is traversed from head to tail, and each itx is
  *      committed to an lwb in that order.
  *
  *      * To clarify:
  *
  *        - the order of "sync" itxs is preserved w.r.t. other
  *          "sync" itxs, regardless of the corresponding objects.
  *        - the order of "async" itxs is preserved w.r.t. other
  *          "async" itxs corresponding to the same object.
  *        - the order of "async" itxs is *not* preserved w.r.t. other
  *          "async" itxs corresponding to different objects.
  *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
  *          versa) is *not* preserved, even for itxs that correspond
  *          to the same object.
  *
  *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
  *      zil_get_commit_list(), and zil_process_commit_list().
  *
  *   3. The lwbs represent a linked list of blocks on disk. Thus, any
  *      lwb cannot be considered committed to stable storage, until its
  *      "previous" lwb is also committed to stable storage. This fact,
  *      coupled with the fact described above, means that itxs are
  *      committed in (roughly) the order in which they were generated.
  *      This is essential because itxs are dependent on prior itxs.
  *      Thus, we *must not* deem an itx as being committed to stable
  *      storage, until *all* prior itxs have also been committed to
  *      stable storage.
  *
  *      To enforce this ordering of lwb zio's, while still leveraging as
  *      much of the underlying storage performance as possible, we rely
  *      on two fundamental concepts:
  *
  *          1. The creation and issuance of lwb zio's is protected by
  *             the zilog's "zl_issuer_lock", which ensures only a single
  *             thread is creating and/or issuing lwb's at a time
  *          2. The "previous" lwb is a child of the "current" lwb
  *             (leveraging the zio parent-child dependency graph)
  *
  *      By relying on this parent-child zio relationship, we can have
  *      many lwb zio's concurrently issued to the underlying storage,
  *      but the order in which they complete will be the same order in
  *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	/*
 	 * zil_commit() must never be called on a snapshot:
 	 *
 	 * 1. Snapshots are immutable, so there can't be any in-flight
 	 *    itxs that would have modified the dataset.
 	 *
 	 * 2. Committing assigns a commit itx to this zilog, which
 	 *    dirties the zilog. Dirtying a snapshot's zilog violates
 	 *    an invariant that is checked elsewhere in the code, and
 	 *    doing so would lead to a panic.
 	 */
 	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	/* With sync disabled there is nothing to wait for. */
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	if (!spa_writeable(zilog->zl_spa)) {
 		/*
 		 * A read-only SPA must not have any itxs pending.
 		 * Returning here without writing them out would break
 		 * the semantics of zil_commit(), so verify that there
 		 * really is nothing outstanding before returning.
 		 */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 		for (int i = 0; i < TXG_SIZE; i++)
 			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
 		return;
 	}
 
 	/*
 	 * While the ZIL is suspended it must not be dirtied by
 	 * zil_commit_itx_assign(), and lwbs can't be written out the
 	 * way zil_commit_writer() would. In that state the required
 	 * semantics are provided by txg_wait_synced() alone; otherwise
 	 * the normal commit path is taken.
 	 */
 	if (zilog->zl_suspend > 0)
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	else
 		zil_commit_impl(zilog, foid);
 }
 
 /*
  * The body of zil_commit(), without any of its up-front checks (sync
  * disabled, SPA not writeable, ZIL suspended). Blocks until the commit
  * waiter created here has been marked done.
  */
 void
 zil_commit_impl(zilog_t *zilog, uint64_t foid)
 {
 	zil_commit_waiter_t *waiter;
 
 	/*
 	 * First, promote the "async" itxs for the given foid onto the
 	 * "sync" queues, so zil_process_commit_list() will commit (or
 	 * skip) them. These itxs have to be committed before this call
 	 * returns, which is why the promotion must happen ahead of
 	 * zil_commit_itx_assign().
 	 */
 	zil_async_to_sync(zilog, foid);
 
 	/*
 	 * Next, allocate a "waiter" and hang it off a new commit itx via
 	 * the itx's "itx_private" field. A commit itx has no on-disk
 	 * state: when it's committed to an lwb its lr_t is not copied
 	 * into the lwb's buffer; instead its waiter is appended to the
 	 * lwb's list of waiters. When the lwb reaches stable storage,
 	 * every waiter on that list is marked "done" and signalled.
 	 *
 	 * Both steps must precede zil_commit_writer(); otherwise this
 	 * particular commit itx is not guaranteed to have been committed
 	 * to an lwb by the time zil_commit_waiter() is called.
 	 */
 	waiter = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, waiter);
 
 	zil_commit_writer(zilog, waiter);
 	zil_commit_waiter(zilog, waiter);
 
 	/*
 	 * An error writing the ZIL blocks this thread was waiting on
 	 * means we fall back to spa_sync() to get the data written out.
 	 * That is expensive, but it is expected to be an exceptional
 	 * case that doesn't occur often.
 	 */
 	if (waiter->zcw_zio_error != 0) {
 		DTRACE_PROBE2(zil__commit__io__error,
 		    zilog_t *, zilog, zil_commit_waiter_t *, waiter);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	}
 
 	zil_free_commit_waiter(waiter);
 }
 
 /*
  * Called in syncing context. Updates the log header for the txg being
  * synced and frees the log blocks at the head of the lwb list that are
  * no longer needed. All of the work is done under "zl_lock".
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t syncing_txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	uint64_t *replayed_seq =
 	    &zilog->zl_replayed_seq[syncing_txg & TXG_MASK];
 
 	/*
 	 * zl_destroy_txg is never zeroed out, so restrict all of this
 	 * to the first sync pass to avoid destroying the log twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	/* Fold this txg's replay progress into the header. */
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	if (zilog->zl_destroy_txg == syncing_txg) {
 		/* Keep a copy of the head block pointer; zh is wiped below. */
 		blkptr_t blk = zh->zh_log;
 
 		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
 
 		bzero(zh, sizeof (zil_header_t));
 		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * This block may belong to a log chain that
 			 * zil_claim() could not claim because a device
 			 * was missing. Should that device come back, the
 			 * block could erroneously look valid. Giving the
 			 * new log chain a fresh GUID makes it irrelevant
 			 * what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		}
 	}
 
 	for (;;) {
 		lwb_t *lwb = list_head(&zilog->zl_lwb_list);
 
 		if (lwb == NULL)
 			break;
 
 		/* The header always tracks the current head of the list. */
 		zh->zh_log = lwb->lwb_blk;
 
 		/* Stop at the first lwb that still has a buffer or a later txg. */
 		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > syncing_txg)
 			break;
 
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_free(spa, syncing_txg, &lwb->lwb_blk);
 		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * An empty lwb list at this point means there was an
 		 * allocation failure. Zero the header's blkptr so the
 		 * block that was just freed isn't freed a second time.
 		 */
 		if (list_head(&zilog->zl_lwb_list) == NULL)
 			BP_ZERO(&zh->zh_log);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * kmem cache constructor for lwb_t objects (registered for
  * "zil_lwb_cache" in zil_init()). Sets up the parts of an lwb that are
  * created once per cached object: the list of commit waiters, the AVL
  * tree of vdev nodes, and the lock for that tree. The "unused" and
  * "kmflag" arguments are ignored. Always returns 0.
  */
 /* ARGSUSED */
 static int
 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 {
 	lwb_t *lwb = vbuf;
 	list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
 	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 	return (0);
 }
 
 /*
  * kmem cache destructor for lwb_t objects (registered for
  * "zil_lwb_cache" in zil_init()). Destroys, in reverse order, what
  * zil_lwb_cons() created: the vdev lock, the vdev AVL tree, and the
  * list of commit waiters. The "unused" argument is ignored.
  */
 /* ARGSUSED */
 static void
 zil_lwb_dest(void *vbuf, void *unused)
 {
 	lwb_t *lwb = vbuf;
 	mutex_destroy(&lwb->lwb_vdev_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 }
 
 /*
  * Create the two kmem caches used by the ZIL: "zil_lwb_cache" for
  * lwb_t objects (with zil_lwb_cons()/zil_lwb_dest() as constructor and
  * destructor) and "zil_zcw_cache" for commit waiters (no constructor or
  * destructor; waiters are initialized in zil_alloc_commit_waiter()).
  */
 void
 zil_init(void)
 {
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
 	    sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
 
 	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
 	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 /*
  * Destroy the kmem caches created by zil_init(), in the reverse of the
  * order in which they were created.
  */
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 }
 
 /*
  * Record a new value for the zilog's "zl_sync" setting. zil_commit()
  * consults this field and returns immediately when it is
  * ZFS_SYNC_DISABLED. No lock is taken for the store.
  */
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 /*
  * Record a new value for the zilog's "zl_logbias" setting. No lock is
  * taken for the store.
  */
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 /*
  * Allocate and initialize the in-memory zilog for an objset.
  *
  * "os" is the objset the log belongs to and "zh_phys" is the ZIL header
  * the new zilog will reference. The structure is zero-filled on
  * allocation; the logbias and sync settings are seeded from the
  * objset. The result is released with zil_free().
  */
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog = kmem_zalloc(sizeof (*zilog), KM_SLEEP);
 
 	/* Identity: header, objset, and the pool objects derived from it. */
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 
 	/* Initial state and the settings inherited from the objset. */
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
 	zilog->zl_dirty_max_txg = 0;
 	zilog->zl_last_lwb_opened = NULL;
 	zilog->zl_last_lwb_latency = 0;
 
 	/* Locks: the two zilog-wide mutexes, then one per itxg slot. */
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	for (int slot = 0; slot < TXG_SIZE; slot++)
 		mutex_init(&zilog->zl_itxg[slot].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 
 	/* Lists of lwbs and of itxs that are being committed. */
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 
 	return (zilog);
 }
 
 /*
  * Tear down and free a zilog created by zil_alloc().
  *
  * The zilog must not be suspended (or in the middle of suspending), and
  * both its lwb list and its commit list must already be empty.
  */
 void
 zil_free(zilog_t *zilog)
 {
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (int slot = 0; slot < TXG_SIZE; slot++) {
 		/*
 		 * An itx can be generated without dirtying a txg (e.g.
 		 * ztest TX_TRUNCATE). No zil_clean() callback runs for
 		 * such an entry, so it has to be removed here instead.
 		 *
 		 * The ziltest itxs are freed up here as well.
 		 */
 		if (zilog->zl_itxg[slot].itxg_itxs != NULL)
 			zil_itxg_clean(zilog->zl_itxg[slot].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[slot].itxg_lock);
 	}
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
 
 	cv_destroy(&zilog->zl_cv_suspend);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  *
  * Looks up the objset's zilog and installs "get_data" as its
  * "zl_get_data" callback. The zilog must not already have a callback
  * installed, and must not have any lwbs. Returns the zilog.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data)
 {
 	zilog_t *zilog;
 
 	zilog = dmu_objset_zil(os);
 
 	/* The log must be in the closed state when it's opened. */
 	ASSERT3P(zilog->zl_get_data, ==, NULL);
 	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_get_data = get_data;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  *
  * For a non-snapshot dataset, all outstanding itxs are committed first.
  * The function then waits for the newest txg that either dirtied the
  * zilog or is referenced by the last lwb to sync, clears the
  * "zl_get_data" callback, and frees the single lwb (and its buffer)
  * that may remain on the lwb list.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
 	uint64_t txg;
 
 	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
 		zil_commit(zilog, 0);
 	} else {
 		/* A snapshot's zilog is never dirtied and has no lwbs. */
 		ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	}
 
 	/* Pick the newest txg referenced by the zilog or its last lwb. */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL)
 		txg = zilog->zl_dirty_max_txg;
 	else
 		txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use txg_wait_synced() to wait long enough for the
 	 * ZIL to be clean, and to wait for all pending lwbs to be
 	 * written out.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	/*
 	 * "txg" is a uint64_t, which is not necessarily the same type as
 	 * the unsigned long long that "%llu" consumes, so it's cast
 	 * explicitly (as is done for the cmn_err() arguments in
 	 * zil_replay_error()).
 	 */
 	if (zilog_is_dirty(zilog))
 		zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog,
 		    (u_longlong_t)txg);
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * We should have only one lwb left on the list; remove it now.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
 		list_remove(&zilog->zl_lwb_list, lwb);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Tag passed to the objset/dataset hold and release calls made by
  * zil_suspend() and zil_resume(), so that the holds taken on suspend
  * are the ones dropped on resume.
  */
 static char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	/* If the objset can't be held, report that error to the caller. */
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	/* A log that still needs replay can't be suspended: EBUSY. */
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	/*
 	 * From here on the dataset is long-held; zil_resume() drops both
 	 * the long hold and the dataset hold.
 	 * NOTE(review): presumably dmu_objset_hold() also took a pool hold
 	 * that is no longer needed once the long hold is in place, which
 	 * is why only the pool is released here -- confirm.
 	 */
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	/* Count this suspender; zil_resume() decrements the count. */
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		/* Without a cookie this is a suspend & resume in one call. */
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * We are the thread doing the actual suspend. Other callers that
 	 * arrive while "zl_suspending" is set wait on "zl_cv_suspend"
 	 * (see above) until the broadcast below.
 	 */
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use zil_commit_impl to ensure we wait for all
 	 * LWB_STATE_OPENED and LWB_STATE_ISSUED lwb's to be committed
 	 * to disk before proceeding. If we used zil_commit instead, it
 	 * would just call txg_wait_synced(), because zl_suspend is set.
 	 * txg_wait_synced() doesn't wait for these lwb's to be
 	 * LWB_STATE_FLUSH_DONE before returning.
 	 */
 	zil_commit_impl(zilog, 0);
 
 	/*
 	 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
 	 * use txg_wait_synced() to ensure the data from the zilog has
 	 * migrated to the main pool before calling zil_destroy().
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	/* The suspend is complete; wake up any threads waiting above. */
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 /*
  * Undo one suspension of the intent log.
  *
  * cookie is the objset_t pointer that zil_suspend() hands back through its
  * cookiep argument (zil_suspend() also calls this directly when it was given
  * no cookiep).  One count is dropped from zl_suspend under zl_lock, after
  * which the long hold and the dataset hold tagged with suspend_tag are
  * released.
  */
 void
 zil_resume(void *cookie)
 {
 	objset_t *os = cookie;
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	/* A resume without a matching suspend would underflow the count. */
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 	/* Pairs with the dsl_dataset_long_hold() taken in zil_suspend(). */
 	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 	/* NOTE(review): presumably pairs with dmu_objset_hold(); confirm. */
 	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 /*
  * State filled in by zil_replay() and passed through zil_parse() to
  * zil_replay_log_record() for every log record.
  */
 typedef struct zil_replay_arg {
 	zil_replay_func_t **zr_replay;	/* replay vectors, indexed by txtype */
 	void		*zr_arg;	/* first argument given to each vector */
 	boolean_t	zr_byteswap;	/* BP_SHOULD_BYTESWAP() of zh_log */
 	char		*zr_lr;		/* scratch copy of the current record */
 } zil_replay_arg_t;
 
 /*
  * Log a warning for a record that could not be replayed and pass the
  * error straight back so callers can write
  * "return (zil_replay_error(...))".
  *
  * zl_replaying_seq is stepped back by one because the record it was
  * advanced for was never actually applied.
  */
 static int
 zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
 {
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	u_longlong_t seq;
 	u_longlong_t type;
 	const char *ci;
 
 	/* didn't actually replay this one */
 	zilog->zl_replaying_seq--;
 
 	dmu_objset_name(zilog->zl_os, dsname);
 
 	/* Report the type without TX_CI; the bit is shown as a "CI" suffix. */
 	seq = (u_longlong_t)lr->lrc_seq;
 	type = (u_longlong_t)(lr->lrc_txtype & ~TX_CI);
 	ci = (lr->lrc_txtype & TX_CI) ? "CI" : "";
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, dsname,
 	    seq, type, ci);
 
 	return (error);
 }
 
 /*
  * Record callback handed to zil_parse() by zil_replay(): replay one log
  * record through the matching entry of the caller's replay vector table.
  *
  * zra is the zil_replay_arg_t set up by zil_replay().  Records whose
  * sequence number is at or below zh_replay_seq, or whose txg is older than
  * claim_txg, are skipped.  Returns 0 when the record was replayed or
  * skipped; otherwise the failure is reported through zil_replay_error()
  * and its error code is returned.
  */
 static int
 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	/* Recorded up front, so it is set even for records skipped below. */
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	/* txtype indexes zr_replay[] below, so reject out-of-range values. */
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	bcopy(lr, zr->zr_lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 * The address passed is the first byte past the record copy in
 	 * zr_lr, which is why zil_replay() sizes that buffer generously.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 /*
  * Callback that zil_replay() passes to zil_parse() alongside
  * zil_replay_log_record(); it only bumps zl_replay_blks and never fails.
  * bp, arg and claim_txg are unused.
  */
 /* ARGSUSED */
 static int
 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	zilog->zl_replay_blks += 1;
 	return (0);
 }
 
 /*
  * If this dataset has a non-empty intent log, replay it and destroy it.
  *
  * os is the objset whose log is examined, arg is passed untouched to every
  * replay vector, and replay_func is the table of vectors indexed by
  * transaction type.  When the header does not carry ZIL_REPLAY_NEEDED the
  * log is simply destroyed (zil_destroy() with B_TRUE) and nothing is
  * replayed.  The result of zil_parse() is deliberately discarded.
  */
 void
 zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *zh = zilog->zl_header;
 	zil_replay_arg_t zr;
 
 	if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
 		zil_destroy(zilog, B_TRUE);
 		return;
 	}
 
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
 	/*
 	 * Scratch space for zil_replay_log_record(), which copies each record
 	 * here and, for a TX_WRITE with a blkptr, reads the write data in
 	 * directly behind the copy.
 	 */
 	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	/* While zl_replay is set, zil_replaying() reports B_TRUE. */
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg);
 	/* Free size must match the kmem_alloc() above. */
 	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	/* Stay in replay mode until the destroy's txg has synced. */
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 }
 
 /*
  * Report whether the caller is running under log replay (or with sync
  * disabled), returning B_TRUE in either case and B_FALSE otherwise.
  *
  * When zl_sync is ZFS_SYNC_DISABLED nothing else is touched.  During
  * replay the dataset is dirtied in tx and the sequence number currently
  * being replayed is stored in the zl_replayed_seq[] slot for tx's txg.
  */
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (!zilog->zl_replay)
 		return (B_FALSE);
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
 	    zilog->zl_replaying_seq;
 
 	return (B_TRUE);
 }
 
 /*
  * Suspend and immediately resume the intent log of the named dataset.
  * Passing a NULL cookie makes zil_suspend() perform the resume itself.
  *
  * Returns 0 on success.  Any failure from zil_suspend() is reported as
  * EEXIST, whatever the underlying error was.  arg is unused.
  */
 /* ARGSUSED */
 int
 zil_reset(const char *osname, void *arg)
 {
 	if (zil_suspend(osname, NULL) != 0)
 		return (SET_ERROR(EEXIST));
 
 	return (0);
 }
Index: vendor-sys/illumos/dist/uts/common/sys/taskq.h
===================================================================
--- vendor-sys/illumos/dist/uts/common/sys/taskq.h	(revision 354952)
+++ vendor-sys/illumos/dist/uts/common/sys/taskq.h	(revision 354953)
@@ -1,93 +1,95 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_TASKQ_H
 #define	_SYS_TASKQ_H
 
 #include <sys/types.h>
 #include <sys/thread.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #define	TASKQ_NAMELEN	31
 
 typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 struct proc;
 
 /*
  * Public flags for taskq_create(): bit range 0-15
  */
 #define	TASKQ_PREPOPULATE	0x0001	/* Prepopulate with threads and data */
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* number of threads as % of ncpu */
 #define	TASKQ_DC_BATCH		0x0010	/* Taskq uses SDC in batch mode */
 
 /*
  * Flags for taskq_dispatch.  TQ_SLEEP/TQ_NOSLEEP should be the same as
  * KM_SLEEP/KM_NOSLEEP.
  */
 #define	TQ_SLEEP	0x00	/* Can block for memory */
 #define	TQ_NOSLEEP	0x01	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
 #define	TQ_NOALLOC	0x04	/* cannot allocate memory; may fail */
 #define	TQ_FRONT	0x08	/* Put task at the front of the queue */
 
+#define	TASKQID_INVALID	((taskqid_t)0)	/* NOTE(review): "no task" id? confirm */
+
 #ifdef _KERNEL
 
 /*
  * Kernel-only taskq interface: these declarations sit inside the
  * enclosing _KERNEL conditional and are invisible to other consumers.
  */
 extern taskq_t *system_taskq;
 
 extern void	taskq_init(void);
 extern void	taskq_mp_init(void);
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int,
     int, uint_t);
 extern taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
     struct proc *, uint_t);
 extern taskq_t	*taskq_create_sysdc(const char *, int, int, int,
     struct proc *, uint_t, uint_t);
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern void	nulltask(void *);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
 /* No "extern" here; a function prototype has external linkage anyway. */
 void	taskq_wait_id(taskq_t *, taskqid_t);
 extern void	taskq_suspend(taskq_t *);
 extern int	taskq_suspended(taskq_t *);
 extern void	taskq_resume(taskq_t *);
 extern int	taskq_member(taskq_t *, kthread_t *);
 
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_TASKQ_H */