Index: head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c	(revision 329627)
+++ head/sys/cddl/contrib/opensolaris/common/nvpair/opensolaris_nvpair.c	(revision 329628)
@@ -1,3307 +1,3307 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/debug.h>
#include <sys/nvpair.h>
#include <sys/nvpair_impl.h>
#include <rpc/types.h>
#include <rpc/xdr.h>

#if defined(_KERNEL) && !defined(_BOOT)
#include <sys/varargs.h>
#include <sys/sunddi.h>
#else
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#endif

#ifndef	offsetof
#define	offsetof(s, m)		((size_t)(&(((s *)0)->m)))
#endif
#define	skip_whitespace(p)	while ((*(p) == ' ') || (*(p) == '\t')) p++

#if defined(__FreeBSD__) && !defined(_KERNEL)
/*
 * libnvpair is the lowest common denominator for ZFS related libraries,
 * defining aok here makes it usable by all ZFS related libraries
 */
int aok;
#endif

/*
 * nvpair.c - Provides kernel & userland interfaces for manipulating
 * name-value pairs.
 *
 * Overview Diagram
 *
 *  +--------------+
 *  |  nvlist_t    |
 *  |--------------|
 *  | nvl_version  |
 *  | nvl_nvflag   |
 *  | nvl_priv    -+-+
 *  | nvl_flag     | |
 *  | nvl_pad      | |
 *  +--------------+ |
 *                   V
 *      +--------------+      last i_nvp in list
 *      | nvpriv_t     |  +--------------------->
 *      |--------------|  |
 *   +--+- nvp_list    |  |   +------------+
 *   |  |  nvp_last   -+--+   + nv_alloc_t |
 *   |  |  nvp_curr    |      |------------|
 *   |  |  nvp_nva    -+----> | nva_ops    |
 *   |  |  nvp_stat    |      | nva_arg    |
 *   |  +--------------+      +------------+
 *   |
 *   +-------+
 *           V
 *   +---------------------+      +-------------------+
 *   |  i_nvp_t            |  +-->|  i_nvp_t          |  +-->
 *   |---------------------|  |   |-------------------|  |
 *   | nvi_next           -+--+   | nvi_next         -+--+
 *   | nvi_prev (NULL)     | <----+ nvi_prev          |
 *   | . . . . . . . . . . |      | . . . . . . . . . |
 *   | nvp (nvpair_t)      |      | nvp (nvpair_t)    |
 *   |  - nvp_size         |      |  - nvp_size       |
 *   |  - nvp_name_sz      |      |  - nvp_name_sz    |
 *   |  - nvp_value_elem   |      |  - nvp_value_elem |
 *   |  - nvp_type         |      |  - nvp_type       |
 *   |  - data ...         |      |  - data ...       |
 *   +---------------------+      +-------------------+
 *
 *
 *
 *   +---------------------+              +---------------------+
 *   |  i_nvp_t            |  +-->  +-->  |  i_nvp_t (last)     |
 *   |---------------------|  |     |     |---------------------|
 *   |  nvi_next          -+--+ ... --+   | nvi_next (NULL)     |
 * <-+- nvi_prev           |<-- ...  <----+ nvi_prev            |
 *   | . . . . . . . . .   |              | . . . . . . . . .   |
 *   | nvp (nvpair_t)      |              | nvp (nvpair_t)      |
 *   |  - nvp_size         |              |  - nvp_size         |
 *   |  - nvp_name_sz      |              |  - nvp_name_sz      |
 *   |  - nvp_value_elem   |              |  - nvp_value_elem   |
 *   |  - DATA_TYPE_NVLIST |              |  - nvp_type         |
 *   |  - data (embedded)  |              |  - data ...         |
 *   |  nvlist name        |              +---------------------+
 *   |  +--------------+   |
 *   |  |  nvlist_t    |   |
 *   |  |--------------|   |
 *   |  | nvl_version  |   |
 *   |  | nvl_nvflag   |   |
 *   |  | nvl_priv   --+---+---->
 *   |  | nvl_flag     |   |
 *   |  | nvl_pad      |   |
 *   |  +--------------+   |
 *   +---------------------+
 *
 *
 * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
 * allow value to be aligned on 8 byte boundary
 *
 * name_len is the length of the name string including the null terminator
 * so it must be >= 1
 */
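/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): walking the i_nvp_t chain pictured above through the public
 * iterator.  Assumes a userland build against libnvpair.
 */
#if 0
#include <stdio.h>
#include <libnvpair.h>

static void
dump_names(nvlist_t *nvl)
{
	nvpair_t *nvp = NULL;

	/* each call follows one nvi_next link from the diagram above */
	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL)
		(void) printf("%s (type %d)\n", nvpair_name(nvp),
		    (int)nvpair_type(nvp));
}
#endif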
#define	NVP_SIZE_CALC(name_len, data_len) \
	(NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))

static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
    uint_t nelem, const void *data);

#define	NV_STAT_EMBEDDED	0x1
#define	EMBEDDED_NVL(nvp)	((nvlist_t *)(void *)NVP_VALUE(nvp))
#define	EMBEDDED_NVL_ARRAY(nvp)	((nvlist_t **)(void *)NVP_VALUE(nvp))

#define	NVP_VALOFF(nvp)	(NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))

#define	NVPAIR2I_NVP(nvp) \
	((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))

int
nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
{
	va_list valist;
	int err = 0;

	nva->nva_ops = nvo;
	nva->nva_arg = NULL;
	va_start(valist, nvo);
	if (nva->nva_ops->nv_ao_init != NULL)
		err = nva->nva_ops->nv_ao_init(nva, valist);
	va_end(valist);

	return (err);
}

void
nv_alloc_reset(nv_alloc_t *nva)
{
	if (nva->nva_ops->nv_ao_reset != NULL)
		nva->nva_ops->nv_ao_reset(nva);
}

void
nv_alloc_fini(nv_alloc_t *nva)
{
	if (nva->nva_ops->nv_ao_fini != NULL)
		nva->nva_ops->nv_ao_fini(nva);
}

nv_alloc_t *
nvlist_lookup_nv_alloc(nvlist_t *nvl)
{
	nvpriv_t *priv;

	if (nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (NULL);

	return (priv->nvp_nva);
}

static void *
nv_mem_zalloc(nvpriv_t *nvp, size_t size)
{
	nv_alloc_t *nva = nvp->nvp_nva;
	void *buf;

	if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
		bzero(buf, size);

	return (buf);
}

static void
nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
{
	nv_alloc_t *nva = nvp->nvp_nva;

	nva->nva_ops->nv_ao_free(nva, buf, size);
}

static void
nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
{
	bzero(priv, sizeof (nvpriv_t));

	priv->nvp_nva = nva;
	priv->nvp_stat = stat;
}

static nvpriv_t *
nv_priv_alloc(nv_alloc_t *nva)
{
	nvpriv_t *priv;

	/*
	 * nv_mem_alloc() cannot be called here because it needs the priv
	 * argument.
	 */
	if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
		return (NULL);

	nv_priv_init(priv, nva, 0);

	return (priv);
}

/*
 * Embedded lists need their own nvpriv_t's.  We create a new
 * nvpriv_t using the parameters and allocator from the parent
 * list's nvpriv_t.
 */
static nvpriv_t *
nv_priv_alloc_embedded(nvpriv_t *priv)
{
	nvpriv_t *emb_priv;

	if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
		return (NULL);

	nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);

	return (emb_priv);
}

static void
nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
{
	nvl->nvl_version = NV_VERSION;
	nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
	nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
	nvl->nvl_flag = 0;
	nvl->nvl_pad = 0;
}

uint_t
nvlist_nvflag(nvlist_t *nvl)
{
	return (nvl->nvl_nvflag);
}

/*
 * nvlist_alloc - Allocate nvlist.
 */
/*ARGSUSED1*/
int
nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
{
#if defined(_KERNEL) && !defined(_BOOT)
	return (nvlist_xalloc(nvlp, nvflag,
	    (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
#else
	return (nvlist_xalloc(nvlp, nvflag, nv_alloc_nosleep));
#endif
}

int
nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
{
	nvpriv_t *priv;

	if (nvlp == NULL || nva == NULL)
		return (EINVAL);

	if ((priv = nv_priv_alloc(nva)) == NULL)
		return (ENOMEM);

	if ((*nvlp = nv_mem_zalloc(priv,
	    NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
		nv_mem_free(priv, priv, sizeof (nvpriv_t));
		return (ENOMEM);
	}

	nvlist_init(*nvlp, nvflag, priv);

	return (0);
}
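/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): typical allocation and teardown through the wrappers above.  The
 * kmflag argument is ignored in userland builds, so 0 is passed here.
 */
#if 0
#include <libnvpair.h>

static int
make_list(nvlist_t **nvlp)
{
	int err;

	/* NV_UNIQUE_NAME: a later add with the same name replaces the pair */
	if ((err = nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0)) != 0)
		return (err);
	if ((err = nvlist_add_uint64(*nvlp, "size", 42)) != 0) {
		nvlist_free(*nvlp);
		*nvlp = NULL;
	}
	return (err);
}
#endif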
/*
 * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
 */
static nvpair_t *
nvp_buf_alloc(nvlist_t *nvl, size_t len)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *buf;
	nvpair_t *nvp;
	size_t nvsize;

	/*
	 * Allocate the buffer
	 */
	nvsize = len + offsetof(i_nvp_t, nvi_nvp);

	if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
		return (NULL);

	nvp = &buf->nvi_nvp;
	nvp->nvp_size = len;

	return (nvp);
}

/*
 * nvp_buf_free - de-Allocate an i_nvp_t.
 */
static void
nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);

	nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
}

/*
 * nvp_buf_link - link a new nv pair into the nvlist.
 */
static void
nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr = NVPAIR2I_NVP(nvp);

	/* Put element at end of nvlist */
	if (priv->nvp_list == NULL) {
		priv->nvp_list = priv->nvp_last = curr;
	} else {
		curr->nvi_prev = priv->nvp_last;
		priv->nvp_last->nvi_next = curr;
		priv->nvp_last = curr;
	}
}

/*
 * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
 */
static void
nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr = NVPAIR2I_NVP(nvp);

	/*
	 * protect nvlist_next_nvpair() against walking on freed memory.
	 */
	if (priv->nvp_curr == curr)
		priv->nvp_curr = curr->nvi_next;

	if (curr == priv->nvp_list)
		priv->nvp_list = curr->nvi_next;
	else
		curr->nvi_prev->nvi_next = curr->nvi_next;

	if (curr == priv->nvp_last)
		priv->nvp_last = curr->nvi_prev;
	else
		curr->nvi_next->nvi_prev = curr->nvi_prev;
}

/*
 * take an nvpair type and number of elements and make sure they are valid
 */
static int
i_validate_type_nelem(data_type_t type, uint_t nelem)
{
	switch (type) {
	case DATA_TYPE_BOOLEAN:
		if (nelem != 0)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_VALUE:
	case DATA_TYPE_BYTE:
	case DATA_TYPE_INT8:
	case DATA_TYPE_UINT8:
	case DATA_TYPE_INT16:
	case DATA_TYPE_UINT16:
	case DATA_TYPE_INT32:
	case DATA_TYPE_UINT32:
	case DATA_TYPE_INT64:
	case DATA_TYPE_UINT64:
	case DATA_TYPE_STRING:
	case DATA_TYPE_HRTIME:
	case DATA_TYPE_NVLIST:
#if !defined(_KERNEL)
	case DATA_TYPE_DOUBLE:
#endif
		if (nelem != 1)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_ARRAY:
	case DATA_TYPE_BYTE_ARRAY:
	case DATA_TYPE_INT8_ARRAY:
	case DATA_TYPE_UINT8_ARRAY:
	case DATA_TYPE_INT16_ARRAY:
	case DATA_TYPE_UINT16_ARRAY:
	case DATA_TYPE_INT32_ARRAY:
	case DATA_TYPE_UINT32_ARRAY:
	case DATA_TYPE_INT64_ARRAY:
	case DATA_TYPE_UINT64_ARRAY:
	case DATA_TYPE_STRING_ARRAY:
	case DATA_TYPE_NVLIST_ARRAY:
		/* we allow arrays with 0 elements */
		break;
	default:
		return (EINVAL);
	}
	return (0);
}
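/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): the nelem rules enforced above.  DATA_TYPE_BOOLEAN carries no
 * value at all, scalars require exactly one element, and array types may
 * legally be empty.
 */
#if 0
#include <libnvpair.h>

static int
add_examples(nvlist_t *nvl)
{
	uint64_t dummy = 0;
	int err;

	if ((err = nvlist_add_boolean(nvl, "present")) != 0)	/* nelem 0 */
		return (err);
	if ((err = nvlist_add_boolean_value(nvl, "on", B_TRUE)) != 0)
		return (err);
	/* arrays with 0 elements are accepted */
	return (nvlist_add_uint64_array(nvl, "empty", &dummy, 0));
}
#endif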
/*
 * Verify nvp_name_sz and check the name string length.
 */
static int
i_validate_nvpair_name(nvpair_t *nvp)
{
	if ((nvp->nvp_name_sz <= 0) ||
	    (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
		return (EFAULT);

	/* verify the name string, make sure it's terminated */
	if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
		return (EFAULT);

	return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
}

static int
i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
{
	switch (type) {
	case DATA_TYPE_BOOLEAN_VALUE:
		if (*(boolean_t *)data != B_TRUE &&
		    *(boolean_t *)data != B_FALSE)
			return (EINVAL);
		break;
	case DATA_TYPE_BOOLEAN_ARRAY: {
		int i;

		for (i = 0; i < nelem; i++)
			if (((boolean_t *)data)[i] != B_TRUE &&
			    ((boolean_t *)data)[i] != B_FALSE)
				return (EINVAL);
		break;
	}
	default:
		break;
	}

	return (0);
}

/*
 * This function takes a pointer to what should be an nvpair and its size
 * and then verifies that all the nvpair fields make sense and can be
 * trusted.  This function is used when decoding packed nvpairs.
 */
static int
i_validate_nvpair(nvpair_t *nvp)
{
	data_type_t type = NVP_TYPE(nvp);
	int size1, size2;

	/* verify nvp_name_sz, check the name string length */
	if (i_validate_nvpair_name(nvp) != 0)
		return (EFAULT);

	if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
		return (EFAULT);

	/*
	 * verify nvp_type, nvp_value_elem, and also possibly
	 * verify string values and get the value size.
	 */
	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
	size1 = nvp->nvp_size - NVP_VALOFF(nvp);
	if (size2 < 0 || size1 != NV_ALIGN(size2))
		return (EFAULT);

	return (0);
}

static int
nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
		return (EINVAL);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
		nvpair_t *nvp = &curr->nvi_nvp;
		int err;

		if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp),
		    NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
			return (err);
	}

	return (0);
}

/*
 * Frees all memory allocated for an nvpair (like embedded lists) with
 * the exception of the nvpair buffer itself.
 */
static void
nvpair_free(nvpair_t *nvp)
{
	switch (NVP_TYPE(nvp)) {
	case DATA_TYPE_NVLIST:
		nvlist_free(EMBEDDED_NVL(nvp));
		break;
	case DATA_TYPE_NVLIST_ARRAY: {
		nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
		int i;

		for (i = 0; i < NVP_NELEM(nvp); i++)
			nvlist_free(nvlp[i]);
		break;
	}
	default:
		break;
	}
}

/*
 * nvlist_free - free an unpacked nvlist
 */
void
nvlist_free(nvlist_t *nvl)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if (nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return;

	/*
	 * Unpacked nvlists are linked through i_nvp_t
	 */
	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;
		curr = curr->nvi_next;

		nvpair_free(nvp);
		nvp_buf_free(nvl, nvp);
	}

	if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
		nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
	else
		nvl->nvl_priv = 0;

	nv_mem_free(priv, priv, sizeof (nvpriv_t));
}

static int
nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
{
	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
	i_nvp_t *curr;

	if (nvp == NULL)
		return (0);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
		if (&curr->nvi_nvp == nvp)
			return (1);

	return (0);
}

/*
 * Make a copy of nvlist
 */
/*ARGSUSED1*/
int
nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
{
#if defined(_KERNEL) && !defined(_BOOT)
	return (nvlist_xdup(nvl, nvlp,
	    (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep)));
#else
	return (nvlist_xdup(nvl, nvlp, nv_alloc_nosleep));
#endif
}

int
nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
{
	int err;
	nvlist_t *ret;

	if (nvl == NULL || nvlp == NULL)
		return (EINVAL);

	if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
		return (err);

	if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
		nvlist_free(ret);
	else
		*nvlp = ret;

	return (err);
}
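/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): nvlist_dup() deep-copies every pair, including embedded lists, so
 * the two lists can be freed independently.
 */
#if 0
#include <libnvpair.h>

static int
clone_list(nvlist_t *src, nvlist_t **dstp)
{
	return (nvlist_dup(src, dstp, 0));	/* kmflag unused in userland */
}
#endif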
/*
 * Remove all with matching name
 */
int
nvlist_remove_all(nvlist_t *nvl, const char *name)
{
	nvpriv_t *priv;
	i_nvp_t *curr;
	int error = ENOENT;

	if (nvl == NULL || name == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (EINVAL);

	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;

		curr = curr->nvi_next;
		if (strcmp(name, NVP_NAME(nvp)) != 0)
			continue;

		nvp_buf_unlink(nvl, nvp);
		nvpair_free(nvp);
		nvp_buf_free(nvl, nvp);

		error = 0;
	}

	return (error);
}

/*
 * Remove first one with matching name and type
 */
int
nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
{
	nvpriv_t *priv;
	i_nvp_t *curr;

	if (nvl == NULL || name == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (EINVAL);

	curr = priv->nvp_list;
	while (curr != NULL) {
		nvpair_t *nvp = &curr->nvi_nvp;

		if (strcmp(name, NVP_NAME(nvp)) == 0 &&
		    NVP_TYPE(nvp) == type) {
			nvp_buf_unlink(nvl, nvp);
			nvpair_free(nvp);
			nvp_buf_free(nvl, nvp);

			return (0);
		}
		curr = curr->nvi_next;
	}

	return (ENOENT);
}

int
nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
{
	if (nvl == NULL || nvp == NULL)
		return (EINVAL);

	nvp_buf_unlink(nvl, nvp);
	nvpair_free(nvp);
	nvp_buf_free(nvl, nvp);
	return (0);
}
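/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): nvlist_remove_all() drops every pair with a matching name, while
 * nvlist_remove() drops only the first pair matching both name and type.
 */
#if 0
#include <libnvpair.h>

static void
drop_pairs(nvlist_t *nvl)
{
	(void) nvlist_remove_all(nvl, "size");
	(void) nvlist_remove(nvl, "flags", DATA_TYPE_UINT64);
}
#endif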
/*
 * This function calculates the size of an nvpair value.
 *
 * The data argument controls the behavior in case of the data types
 *	DATA_TYPE_STRING	and
 *	DATA_TYPE_STRING_ARRAY
 * If data == NULL, the size of the string(s) is excluded.
 */
static int
i_get_value_size(data_type_t type, const void *data, uint_t nelem)
{
	uint64_t value_sz;

	if (i_validate_type_nelem(type, nelem) != 0)
		return (-1);

	/* Calculate required size for holding value */
	switch (type) {
	case DATA_TYPE_BOOLEAN:
		value_sz = 0;
		break;
	case DATA_TYPE_BOOLEAN_VALUE:
		value_sz = sizeof (boolean_t);
		break;
	case DATA_TYPE_BYTE:
		value_sz = sizeof (uchar_t);
		break;
	case DATA_TYPE_INT8:
		value_sz = sizeof (int8_t);
		break;
	case DATA_TYPE_UINT8:
		value_sz = sizeof (uint8_t);
		break;
	case DATA_TYPE_INT16:
		value_sz = sizeof (int16_t);
		break;
	case DATA_TYPE_UINT16:
		value_sz = sizeof (uint16_t);
		break;
	case DATA_TYPE_INT32:
		value_sz = sizeof (int32_t);
		break;
	case DATA_TYPE_UINT32:
		value_sz = sizeof (uint32_t);
		break;
	case DATA_TYPE_INT64:
		value_sz = sizeof (int64_t);
		break;
	case DATA_TYPE_UINT64:
		value_sz = sizeof (uint64_t);
		break;
#if !defined(_KERNEL)
	case DATA_TYPE_DOUBLE:
		value_sz = sizeof (double);
		break;
#endif
	case DATA_TYPE_STRING:
		if (data == NULL)
			value_sz = 0;
		else
			value_sz = strlen(data) + 1;
		break;
	case DATA_TYPE_BOOLEAN_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (boolean_t);
		break;
	case DATA_TYPE_BYTE_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uchar_t);
		break;
	case DATA_TYPE_INT8_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int8_t);
		break;
	case DATA_TYPE_UINT8_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint8_t);
		break;
	case DATA_TYPE_INT16_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int16_t);
		break;
	case DATA_TYPE_UINT16_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint16_t);
		break;
	case DATA_TYPE_INT32_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int32_t);
		break;
	case DATA_TYPE_UINT32_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint32_t);
		break;
	case DATA_TYPE_INT64_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (int64_t);
		break;
	case DATA_TYPE_UINT64_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t);
		break;
	case DATA_TYPE_STRING_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t);

		if (data != NULL) {
			char *const *strs = data;
			uint_t i;

			/* no alignment requirement for strings */
			for (i = 0; i < nelem; i++) {
				if (strs[i] == NULL)
					return (-1);
				value_sz += strlen(strs[i]) + 1;
			}
		}
		break;
	case DATA_TYPE_HRTIME:
		value_sz = sizeof (hrtime_t);
		break;
	case DATA_TYPE_NVLIST:
		value_sz = NV_ALIGN(sizeof (nvlist_t));
		break;
	case DATA_TYPE_NVLIST_ARRAY:
		value_sz = (uint64_t)nelem * sizeof (uint64_t) +
		    (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
		break;
	default:
		return (-1);
	}

	return (value_sz > INT32_MAX ? -1 : (int)value_sz);
}

static int
nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
{
	nvpriv_t *priv;
	int err;

	if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
	    nvl->nvl_priv)) == NULL)
		return (ENOMEM);

	nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);

	if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
		nvlist_free(emb_nvl);
		emb_nvl->nvl_priv = 0;
	}

	return (err);
}

/*
 * nvlist_add_common - Add new pair to nvlist
 */
static int
nvlist_add_common(nvlist_t *nvl, const char *name,
    data_type_t type, uint_t nelem, const void *data)
{
	nvpair_t *nvp;
	uint_t i;

	int nvp_sz, name_sz, value_sz;
	int err = 0;

	if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
		return (EINVAL);

	if (nelem != 0 && data == NULL)
		return (EINVAL);

	/*
	 * Verify type and nelem and get the value size.
	 * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
	 * the size of the string(s) is included.
*/ if ((value_sz = i_get_value_size(type, data, nelem)) < 0) return (EINVAL); if (i_validate_nvpair_value(type, nelem, data) != 0) return (EINVAL); /* * If we're adding an nvlist or nvlist array, ensure that we are not * adding the input nvlist to itself, which would cause recursion, * and ensure that no NULL nvlist pointers are present. */ switch (type) { case DATA_TYPE_NVLIST: if (data == nvl || data == NULL) return (EINVAL); break; case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; for (i = 0; i < nelem; i++) { if (onvlp[i] == nvl || onvlp[i] == NULL) return (EINVAL); } break; } default: break; } /* calculate sizes of the nvpair elements and the nvpair itself */ name_sz = strlen(name) + 1; nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) return (ENOMEM); ASSERT(nvp->nvp_size == nvp_sz); nvp->nvp_name_sz = name_sz; nvp->nvp_value_elem = nelem; nvp->nvp_type = type; bcopy(name, NVP_NAME(nvp), name_sz); switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_STRING_ARRAY: { char *const *strs = data; char *buf = NVP_VALUE(nvp); char **cstrs = (void *)buf; /* skip pre-allocated space for pointer array */ buf += nelem * sizeof (uint64_t); for (i = 0; i < nelem; i++) { int slen = strlen(strs[i]) + 1; bcopy(strs[i], buf, slen); cstrs[i] = buf; buf += slen; } break; } case DATA_TYPE_NVLIST: { nvlist_t *nnvl = EMBEDDED_NVL(nvp); nvlist_t *onvl = (nvlist_t *)data; if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { nvp_buf_free(nvl, nvp); return (err); } break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **onvlp = (nvlist_t **)data; nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); nvlist_t *embedded = (nvlist_t *) ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); for (i = 0; i < nelem; i++) { if ((err = nvlist_copy_embedded(nvl, onvlp[i], embedded)) != 0) { /* * Free any successfully created lists */ nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (err); } nvlp[i] = embedded++; } break; } default: bcopy(data, NVP_VALUE(nvp), value_sz); } /* if unique name, remove before add */ if (nvl->nvl_nvflag & NV_UNIQUE_NAME) (void) nvlist_remove_all(nvl, name); else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) (void) nvlist_remove(nvl, name, type); nvp_buf_link(nvl, nvp); return (0); } int nvlist_add_boolean(nvlist_t *nvl, const char *name) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); } int nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); } int nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); } int nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); } int nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); } int nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); } int nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); } int nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); } int nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); } int nvlist_add_int64(nvlist_t *nvl, const char *name, 
int64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); } int nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); } #if !defined(_KERNEL) int nvlist_add_double(nvlist_t *nvl, const char *name, double val) { return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); } #endif int nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); } int nvlist_add_boolean_array(nvlist_t *nvl, const char *name, boolean_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_add_string_array(nvlist_t *nvl, const char *name, char *const *a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); } int nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) { return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); } int nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); } int nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) { return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); } /* reading name-value pairs */ nvpair_t * nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); /* * Ensure that nvp is a valid nvpair on this nvlist. * NB: nvp_curr is used only as a hint so that we don't always * have to walk the list to determine if nvp is still on the list. */ if (nvp == NULL) curr = priv->nvp_list; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_next; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? 
&curr->nvi_nvp : NULL); } nvpair_t * nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) { nvpriv_t *priv; i_nvp_t *curr; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (NULL); curr = NVPAIR2I_NVP(nvp); if (nvp == NULL) curr = priv->nvp_last; else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) curr = curr->nvi_prev; else curr = NULL; priv->nvp_curr = curr; return (curr != NULL ? &curr->nvi_nvp : NULL); } boolean_t nvlist_empty(nvlist_t *nvl) { nvpriv_t *priv; if (nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (B_TRUE); return (priv->nvp_list == NULL); } char * nvpair_name(nvpair_t *nvp) { return (NVP_NAME(nvp)); } data_type_t nvpair_type(nvpair_t *nvp) { return (NVP_TYPE(nvp)); } int nvpair_type_is_array(nvpair_t *nvp) { data_type_t type = NVP_TYPE(nvp); if ((type == DATA_TYPE_BYTE_ARRAY) || (type == DATA_TYPE_INT8_ARRAY) || (type == DATA_TYPE_UINT8_ARRAY) || (type == DATA_TYPE_INT16_ARRAY) || (type == DATA_TYPE_UINT16_ARRAY) || (type == DATA_TYPE_INT32_ARRAY) || (type == DATA_TYPE_UINT32_ARRAY) || (type == DATA_TYPE_INT64_ARRAY) || (type == DATA_TYPE_UINT64_ARRAY) || (type == DATA_TYPE_BOOLEAN_ARRAY) || (type == DATA_TYPE_STRING_ARRAY) || (type == DATA_TYPE_NVLIST_ARRAY)) return (1); return (0); } static int nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) { if (nvp == NULL || nvpair_type(nvp) != type) return (EINVAL); /* * For non-array types, we copy the data. * For array types (including string), we set a pointer. */ switch (type) { case DATA_TYPE_BOOLEAN: if (nelem != NULL) *nelem = 0; break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif if (data == NULL) return (EINVAL); bcopy(NVP_VALUE(nvp), data, (size_t)i_get_value_size(type, NULL, 1)); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_NVLIST: case DATA_TYPE_STRING: if (data == NULL) return (EINVAL); *(void **)data = (void *)NVP_VALUE(nvp); if (nelem != NULL) *nelem = 1; break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_BYTE_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: case DATA_TYPE_STRING_ARRAY: case DATA_TYPE_NVLIST_ARRAY: if (nelem == NULL || data == NULL) return (EINVAL); if ((*nelem = NVP_NELEM(nvp)) != 0) *(void **)data = (void *)NVP_VALUE(nvp); else *(void **)data = NULL; break; default: return (ENOTSUP); } return (0); } static int nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, uint_t *nelem, void *data) { nvpriv_t *priv; nvpair_t *nvp; i_nvp_t *curr; if (name == NULL || nvl == NULL || (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) return (ENOTSUP); for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { nvp = &curr->nvi_nvp; if (strcmp(name, NVP_NAME(nvp)) == 0 && NVP_TYPE(nvp) == type) return (nvpair_value_common(nvp, type, nelem, data)); } return (ENOENT); } int nvlist_lookup_boolean(nvlist_t *nvl, const char *name) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); } int nvlist_lookup_boolean_value(nvlist_t *nvl, const char 
*name, boolean_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); } int nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); } int nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); } int nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); } int nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); } int nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); } int nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); } int nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); } int nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); } int nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); } #if !defined(_KERNEL) int nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); } #endif int nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); } int nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); } int nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, boolean_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); } int nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uchar_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); } int nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); } int nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint8_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); } int nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, int16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); } int nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint16_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); } int nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, int32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); } int nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint32_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); } int nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, int64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); } int nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint64_t **a, uint_t *n) { return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); } int nvlist_lookup_string_array(nvlist_t 
    *nvl, const char *name, char ***a, uint_t *n)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
}

int
nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
    nvlist_t ***a, uint_t *n)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
}

int
nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
{
	return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
}

int
nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
{
	va_list ap;
	char *name;
	int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
	int ret = 0;

	va_start(ap, flag);
	while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
		data_type_t type;
		void *val;
		uint_t *nelem;

		switch (type = va_arg(ap, data_type_t)) {
		case DATA_TYPE_BOOLEAN:
			ret = nvlist_lookup_common(nvl, name, type,
			    NULL, NULL);
			break;

		case DATA_TYPE_BOOLEAN_VALUE:
		case DATA_TYPE_BYTE:
		case DATA_TYPE_INT8:
		case DATA_TYPE_UINT8:
		case DATA_TYPE_INT16:
		case DATA_TYPE_UINT16:
		case DATA_TYPE_INT32:
		case DATA_TYPE_UINT32:
		case DATA_TYPE_INT64:
		case DATA_TYPE_UINT64:
		case DATA_TYPE_HRTIME:
		case DATA_TYPE_STRING:
		case DATA_TYPE_NVLIST:
#if !defined(_KERNEL)
		case DATA_TYPE_DOUBLE:
#endif
			val = va_arg(ap, void *);
			ret = nvlist_lookup_common(nvl, name, type, NULL, val);
			break;

		case DATA_TYPE_BYTE_ARRAY:
		case DATA_TYPE_BOOLEAN_ARRAY:
		case DATA_TYPE_INT8_ARRAY:
		case DATA_TYPE_UINT8_ARRAY:
		case DATA_TYPE_INT16_ARRAY:
		case DATA_TYPE_UINT16_ARRAY:
		case DATA_TYPE_INT32_ARRAY:
		case DATA_TYPE_UINT32_ARRAY:
		case DATA_TYPE_INT64_ARRAY:
		case DATA_TYPE_UINT64_ARRAY:
		case DATA_TYPE_STRING_ARRAY:
		case DATA_TYPE_NVLIST_ARRAY:
			val = va_arg(ap, void *);
			nelem = va_arg(ap, uint_t *);
			ret = nvlist_lookup_common(nvl, name, type, nelem, val);
			break;

		default:
			ret = EINVAL;
		}

		if (ret == ENOENT && noentok)
			ret = 0;
	}
	va_end(ap);

	return (ret);
}

/*
 * Find the 'name'ed nvpair in the nvlist 'nvl'.  If 'name' found, the function
 * returns zero and a pointer to the matching nvpair is returned in '*ret'
 * (given 'ret' is non-NULL).  If 'sep' is specified then 'name' will penetrate
 * multiple levels of embedded nvlists, with 'sep' as the separator.  As an
 * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
 * "a.d[3].e[1]".  This matches the C syntax for array embed (for convenience,
 * code also supports "a.d[3]e[1]" syntax).
 *
 * If 'ip' is non-NULL and the last name component is an array, return the
 * value of the "...[index]" array index in *ip.  For an array reference that
 * is not indexed, *ip will be returned as -1.  If there is a syntax error in
 * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
 * inside the 'name' string where the syntax error was detected.
 */
static int
nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
    nvpair_t **ret, int *ip, char **ep)
{
	nvpair_t *nvp;
	const char *np;
	char *sepp;
	char *idxp, *idxep;
	nvlist_t **nva;
	long idx;
	int n;

	if (ip)
		*ip = -1;			/* not indexed */
	if (ep)
		*ep = NULL;

	if ((nvl == NULL) || (name == NULL))
		return (EINVAL);

	sepp = NULL;
	idx = 0;
	/* step through components of name */
	for (np = name; np && *np; np = sepp) {
		/* ensure unique names */
		if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
			return (ENOTSUP);

		/* skip white space */
		skip_whitespace(np);
		if (*np == 0)
			break;

		/* set 'sepp' to end of current component 'np' */
		if (sep)
			sepp = strchr(np, sep);
		else
			sepp = NULL;
		/* find start of next "[ index ]..." */
		idxp = strchr(np, '[');

		/* if sepp comes first, set idxp to NULL */
		if (sepp && idxp && (sepp < idxp))
			idxp = NULL;

		/*
		 * At this point 'idxp' is set if there is an index
		 * expected for the current component.
		 */
		if (idxp) {
			/* set 'n' to length of current 'np' name component */
			n = idxp++ - np;

			/* keep sepp up to date for *ep use as we advance */
			skip_whitespace(idxp);
			sepp = idxp;

			/* determine the index value */
#if defined(_KERNEL) && !defined(_BOOT)
			if (ddi_strtol(idxp, &idxep, 0, &idx))
				goto fail;
#else
			idx = strtol(idxp, &idxep, 0);
#endif
			if (idxep == idxp)
				goto fail;

			/* keep sepp up to date for *ep use as we advance */
			sepp = idxep;

			/* skip white space after index value, check for ']' */
			skip_whitespace(sepp);
			if (*sepp++ != ']')
				goto fail;

			/* for embedded arrays, support C syntax: "a[1].b" */
			skip_whitespace(sepp);
			if (sep && (*sepp == sep))
				sepp++;
		} else if (sepp) {
			n = sepp++ - np;
		} else {
			n = strlen(np);
		}

		/* trim trailing whitespace by reducing length of 'np' */
		if (n == 0)
			goto fail;
		for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
			;
		n++;

		/* skip whitespace, and set sepp to NULL if complete */
		if (sepp) {
			skip_whitespace(sepp);
			if (*sepp == 0)
				sepp = NULL;
		}

		/*
		 * At this point:
		 * o  'n' is the length of current 'np' component.
		 * o  'idxp' is set if there was an index, and value 'idx'.
		 * o  'sepp' is set to the beginning of the next component,
		 *    and set to NULL if we have no more components.
		 *
		 * Search for nvpair with matching component name.
		 */
		for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
		    nvp = nvlist_next_nvpair(nvl, nvp)) {

			/* continue if no match on name */
			if (strncmp(np, nvpair_name(nvp), n) ||
			    (strlen(nvpair_name(nvp)) != n))
				continue;

			/* if indexed, verify type is array oriented */
			if (idxp && !nvpair_type_is_array(nvp))
				goto fail;

			/*
			 * Full match found, return nvp and idx if this
			 * was the last component.
			 */
			if (sepp == NULL) {
				if (ret)
					*ret = nvp;
				if (ip && idxp)
					*ip = (int)idx;	/* return index */
				return (0);		/* found */
			}

			/*
			 * More components: current match must be
			 * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
			 * to support going deeper.
			 */
			if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
				nvl = EMBEDDED_NVL(nvp);
				break;
			} else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
				(void) nvpair_value_nvlist_array(nvp,
				    &nva, (uint_t *)&n);
				if ((n < 0) || (idx >= n))
					goto fail;
				nvl = nva[idx];
				break;
			}

			/* type does not support more levels */
			goto fail;
		}
		if (nvp == NULL)
			goto fail;		/* 'name' not found */

		/* search for match of next component in embedded 'nvl' list */
	}

fail:	if (ep && sepp)
		*ep = sepp;
	return (EINVAL);
}

/*
 * Return pointer to nvpair with specified 'name'.
 */
int
nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
{
	return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
}
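/*
 * Editor's illustrative sketch (not part of the original source, compiled
 * out): the '.'-separated syntax accepted by
 * nvlist_lookup_nvpair_embedded_index() below, e.g. "a.d[3].e[1]".
 */
#if 0
#include <libnvpair.h>

static int
find_deep(nvlist_t *nvl)
{
	nvpair_t *nvp;
	int idx;
	char *errp;

	/* on success nvp names "e" and idx holds the trailing index, 1 */
	return (nvlist_lookup_nvpair_embedded_index(nvl, "a.d[3].e[1]",
	    &nvp, &idx, &errp));
}
#endif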
/*
 * Determine if named nvpair exists in nvlist (use embedded separator of '.'
 * and return array index).  See nvlist_lookup_nvpair_ei_sep for more detailed
 * description.
 */
int
nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
    const char *name, nvpair_t **ret, int *ip, char **ep)
{
	return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
}

boolean_t
nvlist_exists(nvlist_t *nvl, const char *name)
{
	nvpriv_t *priv;
	nvpair_t *nvp;
	i_nvp_t *curr;

	if (name == NULL || nvl == NULL ||
	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
		return (B_FALSE);

	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
		nvp = &curr->nvi_nvp;

		if (strcmp(name, NVP_NAME(nvp)) == 0)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
}

int
nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
}

int
nvpair_value_int8(nvpair_t *nvp, int8_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
}

int
nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
}

int
nvpair_value_int16(nvpair_t *nvp, int16_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
}

int
nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
}

int
nvpair_value_int32(nvpair_t *nvp, int32_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
}

int
nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
}

int
nvpair_value_int64(nvpair_t *nvp, int64_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
}

int
nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
}

#if !defined(_KERNEL)
int
nvpair_value_double(nvpair_t *nvp, double *val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
}
#endif

int
nvpair_value_string(nvpair_t *nvp, char **val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
}

int
nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
{
	return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
}

int
nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
}

int
nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
}

int
nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
}

int
nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
}

int
nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
}

int
nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
}

int
nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
}

int
nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
}

int
nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
{
	return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
}

int
nvpair_value_uint64_array(nvpair_t *nvp,
uint64_t **val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); } int nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); } int nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) { return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); } int nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) { return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); } /* * Add specified pair to the list. */ int nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) { if (nvl == NULL || nvp == NULL) return (EINVAL); return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), NVP_NELEM(nvp), NVP_VALUE(nvp))); } /* * Merge the supplied nvlists and put the result in dst. * The merged list will contain all names specified in both lists, * the values are taken from nvl in the case of duplicates. * Return 0 on success. */ /*ARGSUSED*/ int nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) { if (nvl == NULL || dst == NULL) return (EINVAL); if (dst != nvl) return (nvlist_copy_pairs(nvl, dst)); return (0); } /* * Encoding related routines */ #define NVS_OP_ENCODE 0 #define NVS_OP_DECODE 1 #define NVS_OP_GETSIZE 2 typedef struct nvs_ops nvs_ops_t; typedef struct { int nvs_op; const nvs_ops_t *nvs_ops; void *nvs_private; nvpriv_t *nvs_priv; } nvstream_t; /* * nvs operations are: * - nvs_nvlist * encoding / decoding of a nvlist header (nvlist_t) * calculates the size used for header and end detection * * - nvs_nvpair * responsible for the first part of encoding / decoding of an nvpair * calculates the decoded size of an nvpair * * - nvs_nvp_op * second part of encoding / decoding of an nvpair * * - nvs_nvp_size * calculates the encoding size of an nvpair * * - nvs_nvl_fini * encodes the end detection mark (zeros). 
*/ struct nvs_ops { int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); int (*nvs_nvl_fini)(nvstream_t *); }; typedef struct { char nvh_encoding; /* nvs encoding method */ char nvh_endian; /* nvs endian */ char nvh_reserved1; /* reserved for future use */ char nvh_reserved2; /* reserved for future use */ } nvs_header_t; static int nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; /* * Walk nvpair in list and encode each nvpair */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) return (EFAULT); return (nvs->nvs_ops->nvs_nvl_fini(nvs)); } static int nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) { nvpair_t *nvp; size_t nvsize; int err; /* * Get decoded size of next pair in stream, alloc * memory for nvpair_t, then decode the nvpair */ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { if (nvsize == 0) /* end of list */ break; /* make sure len makes sense */ if (nvsize < NVP_SIZE_CALC(1, 0)) return (EFAULT); if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) return (ENOMEM); if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { nvp_buf_free(nvl, nvp); return (err); } if (i_validate_nvpair(nvp) != 0) { nvpair_free(nvp); nvp_buf_free(nvl, nvp); return (EFAULT); } nvp_buf_link(nvl, nvp); } return (err); } static int nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; i_nvp_t *curr; uint64_t nvsize = *buflen; size_t size; /* * Get encoded size of nvpairs in nvlist */ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) return (EINVAL); if ((nvsize += size) > INT32_MAX) return (EINVAL); } *buflen = nvsize; return (0); } static int nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) { int err; if (nvl->nvl_priv == 0) return (EFAULT); /* * Perform the operation, starting with header, then each nvpair */ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) return (err); switch (nvs->nvs_op) { case NVS_OP_ENCODE: err = nvs_encode_pairs(nvs, nvl); break; case NVS_OP_DECODE: err = nvs_decode_pairs(nvs, nvl); break; case NVS_OP_GETSIZE: err = nvs_getsize_pairs(nvs, nvl, buflen); break; default: err = EINVAL; } return (err); } static int nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_operation(nvs, embedded, NULL)); case NVS_OP_DECODE: { nvpriv_t *priv; int err; if (embedded->nvl_version != NV_VERSION) return (ENOTSUP); if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) return (ENOMEM); nvlist_init(embedded, embedded->nvl_nvflag, priv); if ((err = nvs_operation(nvs, embedded, NULL)) != 0) nvlist_free(embedded); return (err); } default: break; } return (EINVAL); } static int nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { size_t nelem = NVP_NELEM(nvp); nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); int i; switch (nvs->nvs_op) { case NVS_OP_ENCODE: for (i = 0; i < nelem; i++) if (nvs_embedded(nvs, nvlp[i]) != 0) return (EFAULT); break; case NVS_OP_DECODE: { size_t len = nelem * sizeof (uint64_t); nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); bzero(nvlp, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if 
(nvs_embedded(nvs, embedded) != 0) { nvpair_free(nvp); return (EFAULT); } nvlp[i] = embedded++; } break; } case NVS_OP_GETSIZE: { uint64_t nvsize = 0; for (i = 0; i < nelem; i++) { size_t nvp_sz = 0; if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) return (EINVAL); if ((nvsize += nvp_sz) > INT32_MAX) return (EINVAL); } *size = nvsize; break; } default: return (EINVAL); } return (0); } static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); /* * Common routine for nvlist operations: * encode, decode, getsize (encoded size). */ static int nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, int nvs_op) { int err = 0; nvstream_t nvs; int nvl_endian; #if BYTE_ORDER == _LITTLE_ENDIAN int host_endian = 1; #else int host_endian = 0; #endif /* _LITTLE_ENDIAN */ nvs_header_t *nvh = (void *)buf; if (buflen == NULL || nvl == NULL || (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) return (EINVAL); nvs.nvs_op = nvs_op; /* * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and * a buffer is allocated. The first 4 bytes in the buffer are * used for encoding method and host endian. */ switch (nvs_op) { case NVS_OP_ENCODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); nvh->nvh_encoding = encoding; nvh->nvh_endian = nvl_endian = host_endian; nvh->nvh_reserved1 = 0; nvh->nvh_reserved2 = 0; break; case NVS_OP_DECODE: if (buf == NULL || *buflen < sizeof (nvs_header_t)) return (EINVAL); /* get method of encoding from first byte */ encoding = nvh->nvh_encoding; nvl_endian = nvh->nvh_endian; break; case NVS_OP_GETSIZE: nvl_endian = host_endian; /* * add the size for encoding */ *buflen = sizeof (nvs_header_t); break; default: return (ENOTSUP); } /* * Create an nvstream with proper encoding method */ switch (encoding) { case NV_ENCODE_NATIVE: /* * check endianness, in case we are unpacking * from a file */ if (nvl_endian != host_endian) return (ENOTSUP); err = nvs_native(&nvs, nvl, buf, buflen); break; case NV_ENCODE_XDR: err = nvs_xdr(&nvs, nvl, buf, buflen); break; default: err = ENOTSUP; break; } return (err); } int nvlist_size(nvlist_t *nvl, size_t *size, int encoding) { return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); } /* * Pack nvlist into contiguous memory */ /*ARGSUSED1*/ int nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, int kmflag) { #if defined(_KERNEL) && !defined(_BOOT) return (nvlist_xpack(nvl, bufp, buflen, encoding, (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); #else return (nvlist_xpack(nvl, bufp, buflen, encoding, nv_alloc_nosleep)); #endif } int nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, nv_alloc_t *nva) { nvpriv_t nvpriv; size_t alloc_size; char *buf; int err; if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) return (EINVAL); if (*bufp != NULL) return (nvlist_common(nvl, *bufp, buflen, encoding, NVS_OP_ENCODE)); /* * Here is a difficult situation: * 1. The nvlist has fixed allocator properties. * All other nvlist routines (like nvlist_add_*, ...) use * these properties. - * 2. When using nvlist_pack() the user can specify his own + * 2. When using nvlist_pack() the user can specify their own * allocator properties (e.g. by using KM_NOSLEEP). * * We use the user specified properties (2). A clearer solution * will be to remove the kmflag from nvlist_pack(), but we will * not change the interface. 
*/ nv_priv_init(&nvpriv, nva, 0); if ((err = nvlist_size(nvl, &alloc_size, encoding))) return (err); if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) return (ENOMEM); if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, NVS_OP_ENCODE)) != 0) { nv_mem_free(&nvpriv, buf, alloc_size); } else { *buflen = alloc_size; *bufp = buf; } return (err); } /* * Unpack buf into an nvlist_t */ /*ARGSUSED1*/ int nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) { #if defined(_KERNEL) && !defined(_BOOT) return (nvlist_xunpack(buf, buflen, nvlp, (kmflag == KM_SLEEP ? nv_alloc_sleep : nv_alloc_nosleep))); #else return (nvlist_xunpack(buf, buflen, nvlp, nv_alloc_nosleep)); #endif } int nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) { nvlist_t *nvl; int err; if (nvlp == NULL) return (EINVAL); if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) return (err); if ((err = nvlist_common(nvl, buf, &buflen, 0, NVS_OP_DECODE)) != 0) nvlist_free(nvl); else *nvlp = nvl; return (err); } /* * Native encoding functions */ typedef struct { /* * This structure is used when decoding a packed nvpair in * the native format. n_base points to a buffer containing the * packed nvpair. n_end is a pointer to the end of the buffer. * (n_end actually points to the first byte past the end of the * buffer.) n_curr is a pointer that lies between n_base and n_end. * It points to the current data that we are decoding. * The amount of data left in the buffer is equal to n_end - n_curr. * n_flag is used to recognize a packed embedded list. */ caddr_t n_base; caddr_t n_end; caddr_t n_curr; uint_t n_flag; } nvs_native_t; static int nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, size_t buflen) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: nvs->nvs_private = native; native->n_curr = native->n_base = buf; native->n_end = buf + buflen; native->n_flag = 0; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = native; native->n_curr = native->n_base = native->n_end = NULL; native->n_flag = 0; return (0); default: return (EINVAL); } } /*ARGSUSED*/ static void nvs_native_destroy(nvstream_t *nvs) { } static int native_cp(nvstream_t *nvs, void *buf, size_t size) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; if (native->n_curr + size > native->n_end) return (EFAULT); /* * The bcopy() below eliminates alignment requirement * on the buffer (stream) and is preferred over direct access. 
*/ switch (nvs->nvs_op) { case NVS_OP_ENCODE: bcopy(buf, native->n_curr, size); break; case NVS_OP_DECODE: bcopy(native->n_curr, buf, size); break; default: return (EINVAL); } native->n_curr += size; return (0); } /* * operate on nvlist_t header */ static int nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { nvs_native_t *native = nvs->nvs_private; switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native->n_flag) return (0); /* packed embedded list */ native->n_flag = 1; /* copy version and nvflag of the nvlist_t */ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) return (EFAULT); return (0); case NVS_OP_GETSIZE: /* * if calculate for packed embedded list * 4 for end of the embedded list * else * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag * and 4 for end of the entire list */ if (native->n_flag) { *size += 4; } else { native->n_flag = 1; *size += 2 * sizeof (int32_t) + 4; } return (0); default: return (EINVAL); } } static int nvs_native_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; /* * Add 4 zero bytes at end of nvlist. They are used * for end detection by the decode routine. */ if (native->n_curr + sizeof (int) > native->n_end) return (EFAULT); bzero(native->n_curr, sizeof (int)); native->n_curr += sizeof (int); } return (0); } static int nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *packed = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out the pointer that is meaningless in the packed * structure. The address may not be aligned, so we have * to use bzero. */ bzero(packed + offsetof(nvlist_t, nvl_priv), sizeof(((nvlist_t *)NULL)->nvl_priv)); } return (nvs_embedded(nvs, EMBEDDED_NVL(nvp))); } static int nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp) { if (nvs->nvs_op == NVS_OP_ENCODE) { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp); size_t len = NVP_NELEM(nvp) * sizeof (uint64_t); int i; /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. */ bzero(value, len); value += len; for (i = 0; i < NVP_NELEM(nvp); i++) { /* * Null out the pointer that is meaningless in the * packed structure. The address may not be aligned, * so we have to use bzero. */ bzero(value + offsetof(nvlist_t, nvl_priv), sizeof(((nvlist_t *)NULL)->nvl_priv)); value += sizeof(nvlist_t); } } return (nvs_embedded_nvl_array(nvs, nvp, NULL)); } static void nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; uint64_t *strp = (void *) (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); /* * Null out pointers that are meaningless in the packed * structure. The addresses may not be aligned, so we have * to use bzero. 
*/ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t)); break; } case NVS_OP_DECODE: { char **strp = (void *)NVP_VALUE(nvp); char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t)); int i; for (i = 0; i < NVP_NELEM(nvp); i++) { strp[i] = buf; buf += strlen(buf) + 1; } break; } } } static int nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; int value_sz; int ret = 0; /* * We do the initial bcopy of the data before we look at * the nvpair type, because when we're decoding, we won't * have the correct values for the pair until we do the bcopy. */ switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: if (native_cp(nvs, nvp, nvp->nvp_size) != 0) return (EFAULT); break; default: return (EINVAL); } /* verify nvp_name_sz, check the name string length */ if (i_validate_nvpair_name(nvp) != 0) return (EFAULT); type = NVP_TYPE(nvp); /* * Verify type and nelem and get the value size. * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * the size of the string(s) is excluded. */ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) return (EFAULT); if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: ret = nvpair_native_embedded(nvs, nvp); break; case DATA_TYPE_NVLIST_ARRAY: ret = nvpair_native_embedded_array(nvs, nvp); break; case DATA_TYPE_STRING_ARRAY: nvpair_native_string_array(nvs, nvp); break; default: break; } return (ret); } static int nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { uint64_t nvp_sz = nvp->nvp_size; switch (NVP_TYPE(nvp)) { case DATA_TYPE_NVLIST: { size_t nvsize = 0; if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize; if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) return (EINVAL); nvp_sz += nvsize; break; } default: break; } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } static int nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: return (nvs_native_nvp_op(nvs, nvp)); case NVS_OP_DECODE: { nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; int32_t decode_len; /* try to read the size value from the stream */ if (native->n_curr + sizeof (int32_t) > native->n_end) return (EFAULT); bcopy(native->n_curr, &decode_len, sizeof (int32_t)); /* sanity check the size value */ if (decode_len < 0 || decode_len > native->n_end - native->n_curr) return (EFAULT); *size = decode_len; /* * If at the end of the stream then move the cursor * forward, otherwise nvs_native_nvp_op() will read * the entire nvpair at the same cursor position.
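/*
 * The decode path above never trusts a length it read from the
 * stream.  A minimal standalone version of that sanity check
 * (illustrative names; memcpy stands in for bcopy):
 */
#include <stdint.h>
#include <string.h>

static int
read_len(const char *curr, const char *end, int32_t *lenp)
{
	int32_t len;

	if (curr + sizeof (int32_t) > end)
		return (-1);			/* truncated stream */
	memcpy(&len, curr, sizeof (int32_t));	/* alignment-safe read */
	if (len < 0 || len > end - curr)
		return (-1);			/* implausible size */
	*lenp = len;
	return (0);
}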
*/ if (*size == 0) native->n_curr += sizeof (int32_t); break; } default: return (EINVAL); } return (0); } static const nvs_ops_t nvs_native_ops = { nvs_native_nvlist, nvs_native_nvpair, nvs_native_nvp_op, nvs_native_nvp_size, nvs_native_nvl_fini }; static int nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { nvs_native_t native; int err; nvs->nvs_ops = &nvs_native_ops; if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_native_destroy(nvs); return (err); } /* * XDR encoding functions * * An xdr packed nvlist is encoded as: * * - encoding method and host endian (4 bytes) * - nvl_version (4 bytes) * - nvl_nvflag (4 bytes) * * - encoded nvpairs, the format of one xdr encoded nvpair is: * - encoded size of the nvpair (4 bytes) * - decoded size of the nvpair (4 bytes) * - name string, (4 + sizeof(NV_ALIGN4(string))) * a string is coded as size (4 bytes) and data * - data type (4 bytes) * - number of elements in the nvpair (4 bytes) * - data * * - 2 zeros for end of the entire list (8 bytes) */ static int nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen) { /* xdr data must be 4 byte aligned */ if ((ulong_t)buf % 4 != 0) return (EFAULT); switch (nvs->nvs_op) { case NVS_OP_ENCODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE); nvs->nvs_private = xdr; return (0); case NVS_OP_DECODE: xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE); nvs->nvs_private = xdr; return (0); case NVS_OP_GETSIZE: nvs->nvs_private = NULL; return (0); default: return (EINVAL); } } static void nvs_xdr_destroy(nvstream_t *nvs) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: xdr_destroy((XDR *)nvs->nvs_private); break; default: break; } } static int nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) { switch (nvs->nvs_op) { case NVS_OP_ENCODE: case NVS_OP_DECODE: { XDR *xdr = nvs->nvs_private; if (!xdr_int(xdr, &nvl->nvl_version) || !xdr_u_int(xdr, &nvl->nvl_nvflag)) return (EFAULT); break; } case NVS_OP_GETSIZE: { /* * 2 * 4 for nvl_version + nvl_nvflag * and 8 for end of the entire list */ *size += 2 * 4 + 8; break; } default: return (EINVAL); } return (0); } static int nvs_xdr_nvl_fini(nvstream_t *nvs) { if (nvs->nvs_op == NVS_OP_ENCODE) { XDR *xdr = nvs->nvs_private; int zero = 0; if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero)) return (EFAULT); } return (0); } /* * The format of an xdr encoded nvpair is: * encode_size, decode_size, name string, data type, nelem, data */ static int nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp) { data_type_t type; char *buf; char *buf_end = (char *)nvp + nvp->nvp_size; int value_sz; uint_t nelem, buflen; bool_t ret = FALSE; XDR *xdr = nvs->nvs_private; ASSERT(xdr != NULL && nvp != NULL); /* name string */ if ((buf = NVP_NAME(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (!xdr_string(xdr, &buf, buflen - 1)) return (EFAULT); nvp->nvp_name_sz = strlen(buf) + 1; /* type and nelem */ if (!xdr_int(xdr, (int *)&nvp->nvp_type) || !xdr_int(xdr, &nvp->nvp_value_elem)) return (EFAULT); type = NVP_TYPE(nvp); nelem = nvp->nvp_value_elem; /* * Verify type and nelem and get the value size. * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY * the size of the string(s) is excluded.
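/*
 * A small userland sketch of the list header layout documented above,
 * using the standard Sun RPC xdrmem stream (rpc/rpc.h).  It encodes
 * nvl_version and nvl_nvflag as two 4-byte big-endian words, exactly
 * the 2 * 4 bytes that nvs_xdr_nvlist() accounts for, followed by the
 * two zero words that terminate the list.  Values are illustrative.
 */
#include <rpc/rpc.h>

static int
encode_header(char *buf, unsigned int buflen)
{
	XDR xdr;
	int version = 0;		/* NV_VERSION */
	unsigned int nvflag = 1;	/* e.g. NV_UNIQUE_NAME */
	int zero = 0;
	int ok;

	xdrmem_create(&xdr, buf, buflen, XDR_ENCODE);
	ok = xdr_int(&xdr, &version) &&
	    xdr_u_int(&xdr, &nvflag) &&
	    xdr_int(&xdr, &zero) && xdr_int(&xdr, &zero);
	xdr_destroy(&xdr);
	return (ok ? 0 : -1);	/* 16 bytes consumed on success */
}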
*/ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0) return (EFAULT); /* if there is no data to extract then return */ if (nelem == 0) return (0); /* value */ if ((buf = NVP_VALUE(nvp)) >= buf_end) return (EFAULT); buflen = buf_end - buf; if (buflen < value_sz) return (EFAULT); switch (type) { case DATA_TYPE_NVLIST: if (nvs_embedded(nvs, (void *)buf) == 0) return (0); break; case DATA_TYPE_NVLIST_ARRAY: if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0) return (0); break; case DATA_TYPE_BOOLEAN: ret = TRUE; break; case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: ret = xdr_char(xdr, buf); break; case DATA_TYPE_INT16: ret = xdr_short(xdr, (void *)buf); break; case DATA_TYPE_UINT16: ret = xdr_u_short(xdr, (void *)buf); break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_INT32: ret = xdr_int(xdr, (void *)buf); break; case DATA_TYPE_UINT32: ret = xdr_u_int(xdr, (void *)buf); break; case DATA_TYPE_INT64: ret = xdr_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_UINT64: ret = xdr_u_longlong_t(xdr, (void *)buf); break; case DATA_TYPE_HRTIME: /* * NOTE: must expose the definition of hrtime_t here */ ret = xdr_longlong_t(xdr, (void *)buf); break; #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: ret = xdr_double(xdr, (void *)buf); break; #endif case DATA_TYPE_STRING: ret = xdr_string(xdr, &buf, buflen - 1); break; case DATA_TYPE_BYTE_ARRAY: ret = xdr_opaque(xdr, buf, nelem); break; case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t), (xdrproc_t)xdr_char); break; case DATA_TYPE_INT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t), sizeof (int16_t), (xdrproc_t)xdr_short); break; case DATA_TYPE_UINT16_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t), sizeof (uint16_t), (xdrproc_t)xdr_u_short); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t), sizeof (int32_t), (xdrproc_t)xdr_int); break; case DATA_TYPE_UINT32_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t), sizeof (uint32_t), (xdrproc_t)xdr_u_int); break; case DATA_TYPE_INT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t), sizeof (int64_t), (xdrproc_t)xdr_longlong_t); break; case DATA_TYPE_UINT64_ARRAY: ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t), sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t); break; case DATA_TYPE_STRING_ARRAY: { size_t len = nelem * sizeof (uint64_t); char **strp = (void *)buf; int i; if (nvs->nvs_op == NVS_OP_DECODE) bzero(buf, len); /* don't trust packed data */ for (i = 0; i < nelem; i++) { if (buflen <= len) return (EFAULT); buf += len; buflen -= len; if (xdr_string(xdr, &buf, buflen - 1) != TRUE) return (EFAULT); if (nvs->nvs_op == NVS_OP_DECODE) strp[i] = buf; len = strlen(buf) + 1; } ret = TRUE; break; } default: break; } return (ret == TRUE ? 
0 : EFAULT); } static int nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { data_type_t type = NVP_TYPE(nvp); /* * encode_size + decode_size + name string size + data type + nelem * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) */ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4; switch (type) { case DATA_TYPE_BOOLEAN: break; case DATA_TYPE_BOOLEAN_VALUE: case DATA_TYPE_BYTE: case DATA_TYPE_INT8: case DATA_TYPE_UINT8: case DATA_TYPE_INT16: case DATA_TYPE_UINT16: case DATA_TYPE_INT32: case DATA_TYPE_UINT32: nvp_sz += 4; /* 4 is the minimum xdr unit */ break; case DATA_TYPE_INT64: case DATA_TYPE_UINT64: case DATA_TYPE_HRTIME: #if !defined(_KERNEL) case DATA_TYPE_DOUBLE: #endif nvp_sz += 8; break; case DATA_TYPE_STRING: nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp))); break; case DATA_TYPE_BYTE_ARRAY: nvp_sz += NV_ALIGN4(NVP_NELEM(nvp)); break; case DATA_TYPE_BOOLEAN_ARRAY: case DATA_TYPE_INT8_ARRAY: case DATA_TYPE_UINT8_ARRAY: case DATA_TYPE_INT16_ARRAY: case DATA_TYPE_UINT16_ARRAY: case DATA_TYPE_INT32_ARRAY: case DATA_TYPE_UINT32_ARRAY: nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_INT64_ARRAY: case DATA_TYPE_UINT64_ARRAY: nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp); break; case DATA_TYPE_STRING_ARRAY: { int i; char **strs = (void *)NVP_VALUE(nvp); for (i = 0; i < NVP_NELEM(nvp); i++) nvp_sz += 4 + NV_ALIGN4(strlen(strs[i])); break; } case DATA_TYPE_NVLIST: case DATA_TYPE_NVLIST_ARRAY: { size_t nvsize = 0; int old_nvs_op = nvs->nvs_op; int err; nvs->nvs_op = NVS_OP_GETSIZE; if (type == DATA_TYPE_NVLIST) err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize); else err = nvs_embedded_nvl_array(nvs, nvp, &nvsize); nvs->nvs_op = old_nvs_op; if (err != 0) return (EINVAL); nvp_sz += nvsize; break; } default: return (EINVAL); } if (nvp_sz > INT32_MAX) return (EINVAL); *size = nvp_sz; return (0); } /* * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates * the largest nvpair that could be encoded in the buffer. * * See comments above nvs_xdr_nvp_op() for the format of xdr encoding. * The size of an xdr packed nvpair without any data is 5 words. * * Using the size of the data directly as an estimate would be ok * in all cases except one. If the data type is of DATA_TYPE_STRING_ARRAY * then the actual nvpair has space for an array of pointers to index * the strings. These pointers are not encoded into the packed xdr buffer. * * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are * of length 0, then each string is encoded in xdr format as a single word. * Therefore when expanded to an nvpair there will be 2.25 words used for * each string. (an int64_t allocated for pointer usage, and a single char * for the null termination.) * * This is the calculation performed by the NVS_XDR_MAX_LEN macro. */ #define NVS_XDR_HDR_LEN ((size_t)(5 * 4)) #define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ?
\ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) #define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ (NVS_XDR_DATA_LEN(x) * 2) + \ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) static int nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) { XDR *xdr = nvs->nvs_private; int32_t encode_len, decode_len; switch (nvs->nvs_op) { case NVS_OP_ENCODE: { size_t nvsize; if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) return (EFAULT); decode_len = nvp->nvp_size; encode_len = nvsize; if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); return (nvs_xdr_nvp_op(nvs, nvp)); } case NVS_OP_DECODE: { struct xdr_bytesrec bytesrec; /* get the encode and decode size */ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) return (EFAULT); *size = decode_len; /* are we at the end of the stream? */ if (*size == 0) return (0); /* sanity check the size parameter */ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) return (EFAULT); if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) return (EFAULT); break; } default: return (EINVAL); } return (0); } static const struct nvs_ops nvs_xdr_ops = { nvs_xdr_nvlist, nvs_xdr_nvpair, nvs_xdr_nvp_op, nvs_xdr_nvp_size, nvs_xdr_nvl_fini }; static int nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) { XDR xdr; int err; nvs->nvs_ops = &nvs_xdr_ops; if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), *buflen - sizeof (nvs_header_t))) != 0) return (err); err = nvs_operation(nvs, nvl, buflen); nvs_xdr_destroy(nvs); return (err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c (revision 329627) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c (revision 329628) @@ -1,922 +1,922 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups * of names after deciding which is the appropriate lookup interface.
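/*
 * A compact userland restatement of the lookup-interface decision
 * described above (simplified, illustrative names): the zap match
 * flags depend only on whether the filesystem normalizes names and,
 * if so, whether it was created case-mixed, in which case the match
 * must still honor the case handed in by the caller.
 */
#define	DEMO_MT_EXACT		0u	/* plain zap_lookup() */
#define	DEMO_MT_NORMALIZE	1u	/* zap_lookup_norm() */
#define	DEMO_MT_MATCH_CASE	2u	/* ... without case folding */

static unsigned
demo_match_flags(int z_norm, int case_mixed)
{
	unsigned mt = DEMO_MT_EXACT;

	if (z_norm) {
		mt = DEMO_MT_NORMALIZE;
		if (case_mixed)
			mt |= DEMO_MT_MATCH_CASE;
	}
	return (mt);
}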
*/ static int zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, matchtype_t mt, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); return (error); } /* * Look up a directory entry under a locked vnode. * dvp being locked gives us a guarantee that there are no concurrent * modifications of the directory and, thus, if a node can be found in * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZXATTR: we want dzp's xattr directory * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. */ int zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; matchtype_t mt = 0; uint64_t zoid; vnode_t *vp = NULL; int error = 0; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices * affect how we perform zap lookups. * * When matching we may need to normalize & change case according to * FS settings. * * Note that a normalized match is necessary for a case insensitive * filesystem when the lookup request is not exact because normalization * can fold case independent of normalizing code point sequences. * * See the table above zfs_dropname(). */ if (zfsvfs->z_norm != 0) { mt = MT_NORMALIZE; /* * Determine if the match needs to honor the case specified in * lookup, and if so keep track of that so that during * normalization we don't fold case. */ if (zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } } /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE * because in that case MT_EXACT and MT_FIRST should produce exactly * the same result. */ if (dzp->z_unlinked && !(flag & ZXATTR)) return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ?
ENOENT : 0); } else { error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { return (error); } } else { if (flag & ZNEW) { return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); if (error) return (error); ASSERT(!(*zpp)->z_unlinked); } return (0); } static int zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; uint64_t parent; int error; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); if (dzp->z_unlinked) return (ENOENT); if ((error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) return (error); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *zpp = zp; return (error); } int zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; int error = 0; ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); if (dzp->z_unlinked) return (SET_ERROR(ENOENT)); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *zpp = dzp; } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { error = zfs_dd_lookup(dzp, zpp); } else { error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ *zpp = zp; } } return (error); } /* * unlinked Set (formerly known as the "delete queue") Error Handling * * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we * don't specify the name of the entry that we will be manipulating. We * also fib and say that we won't be adding any new entries to the * unlinked set, even though we might (this is to lower the minimum file * size that can be deleted in a full filesystem). So on the small * chance that the nlink list is using a fat zap (ie. has more than * 2000 entries), we *may* not pre-read a block that's needed. * Therefore it is remotely possible for some of the assertions * regarding the unlinked set below to fail due to i/o error. On a * nondebug system, this will result in the space being leaked. */ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); } /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. */ void zfs_unlinked_drain(zfsvfs_t *zfsvfs) { zap_cursor_t zc; zap_attribute_t zap; dmu_object_info_t doi; znode_t *zp; int error; /* * Iterate over the contents of the unlinked set. */ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); zap_cursor_retrieve(&zc, &zap) == 0; zap_cursor_advance(&zc)) { /* * See what kind of object we have in list */ error = dmu_object_info(zfsvfs->z_os, zap.za_first_integer, &doi); if (error != 0) continue; ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); /* * We need to re-mark these list entries for deletion, * so we pull them back into core and set zp->z_unlinked. */ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); /* * We may pick up znodes that are already marked for deletion. * This could happen during the purge of an extended attribute * directory. All we need to do is skip over them, since they * are already in the system marked z_unlinked.
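/*
 * The name classification performed by zfs_dirlook() above is plain
 * string inspection; a standalone version of the same test
 * (illustrative name and return values):
 */
static int	/* 0 = ordinary, 1 = self ("" or "."), 2 = parent ("..") */
demo_classify_name(const char *name)
{
	if (name[0] == '\0' || (name[0] == '.' && name[1] == '\0'))
		return (1);
	if (name[0] == '.' && name[1] == '.' && name[2] == '\0')
		return (2);
	return (0);
}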
*/ if (error != 0) continue; vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); zp->z_unlinked = B_TRUE; vput(ZTOV(zp)); } zap_cursor_fini(&zc); } /* * Delete the entire contents of a directory. Return a count * of the number of entries that could not be deleted. If we encounter * an error, return a count of at least one so that the directory stays * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. * Also, it assumes the directory contents are *only* regular * files. */ static int zfs_purgedir(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int skipped = 0; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); if (error) { skipped += 1; continue; } vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed? */ zfs_sa_upgrade_txholds(tx, xzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); vput(ZTOV(xzp)); skipped += 1; continue; } error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) skipped += 1; return (skipped); } void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; int error; ASSERT(zp->z_links == 0); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } else { /* * Free up all the data in the file. We don't do this for * XATTR directories because we need truncate and remove to be * in the same tx, like in zfs_znode_delete(). Otherwise, if * we crash here we'll end up with an inconsistent truncated * zap object in the delete queue. Note a truncated file is * harmless since it only contains user data. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT3S(error, ==, 0); vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); } acl_obj = zfs_external_acl(zp); /* * Set up the final transaction.
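/*
 * A standalone sketch of the 64-bit directory-entry value seen above:
 * ZFS_DIRENT_OBJ() keeps the low 48 bits (the object number), and on
 * ZPL versions with typed dirents zfs_dirent() below ORs the d_type
 * nibble into bits 60-63.  The mask here is an illustrative stand-in
 * for the real macro.
 */
#include <stdint.h>

#define	DEMO_OBJ_MASK	((1ULL << 48) - 1)	/* cf. ZFS_DIRENT_OBJ() */

static uint64_t
demo_pack_dirent(uint64_t object, uint64_t dtype)
{
	return ((object & DEMO_OBJ_MASK) | (dtype << 60));
}

static uint64_t
demo_unpack_object(uint64_t de)
{
	return (de & DEMO_OBJ_MASK);	/* drop the type bits */
}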
*/ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); zfs_znode_free(zp); goto out; } if (xzp) { ASSERT(error == 0); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ xzp->z_links = 0; /* no more links to it */ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &xzp->z_links, sizeof (xzp->z_links), tx)); zfs_unlinked_add(xzp, tx); } /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); out: if (xzp) vput(ZTOV(xzp)); } static uint64_t zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) de |= IFTODT(mode) << 60; return (de); } /* * Link zp into dzp. Can only fail if zp has been unlinked. */ int zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); #ifdef __FreeBSD__ if (zp_is_dir) { if (dzp->z_links >= ZFS_LINK_MAX) return (SET_ERROR(EMLINK)); } #endif if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); return (SET_ERROR(ENOENT)); } #ifdef __FreeBSD__ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { return (SET_ERROR(EMLINK)); } #endif zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); } else { ASSERT(zp->z_unlinked == 0); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); VERIFY0(error); return (0); } /* * The match type in the code for this function should conform to: * * 
------------------------------------------------------------------------ * fs type | z_norm | lookup type | match type * ---------|-------------|-------------|---------------------------------- * CS !norm | 0 | 0 | 0 (exact) * CS norm | formX | 0 | MT_NORMALIZE * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | ZCILOOK | MT_NORMALIZE * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE * * Abbreviations: * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) * formX = unicode normalization form set on fs creation */ static int zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { matchtype_t mt = MT_NORMALIZE; if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, name, mt, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); } return (error); } /* * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (!(flag & ZRENAMING)) { if (zp_is_dir && !zfs_dirempty(zp)) { #ifdef illumos return (SET_ERROR(EEXIST)); #else return (SET_ERROR(ENOTEMPTY)); #endif } /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { return (error); } if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " "should be at least %u", zp->z_vnode, (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; zp->z_links = 0; unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, B_TRUE); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT0(error); } else { ASSERT(zp->z_unlinked == 0); error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." 
link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); } /* * Indicate whether the directory is empty. */ boolean_t zfs_dirempty(znode_t *dzp) { return (dzp->z_size == 2); } int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t parent; *xvpp = NULL; /* * In FreeBSD, access checking for creating an EA is being done * in zfs_setextattr(). */ #ifndef __FreeBSD_kernel__ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); #endif if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); #ifdef DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); *xvpp = ZTOV(xzp); return (0); } /* * Return a znode for the extended attribute directory for zp. * ** If the directory does not already exist, it is created ** * * IN: zp - znode to obtain attribute directory from * cr - credentials of caller * flags - flags from the VOP_LOOKUP call * * OUT: xzpp - pointer to extended attribute znode * * RETURN: 0 on success * error number on failure */ int zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; vattr_t va; int error; top: error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xvpp = ZTOV(xzp); return (0); } if (!(flags & CREATE_XATTR_DIR)) { #ifdef illumos return (SET_ERROR(ENOENT)); #else return (SET_ERROR(ENOATTR)); #endif } if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { return (SET_ERROR(EROFS)); } /* * The ability to 'create' files in an attribute * directory comes from the write_xattr permission on the base file. * * The ability to 'search' an attribute directory requires * read_xattr permission on the base file.
* * Once in a directory the ability to read/write attributes * is controlled by the permissions on the attribute file. */ va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } if (error == 0) VOP_UNLOCK(*xvpp, 0); return (error); } /* * Decide whether it is okay to remove within a sticky directory. * * In sticky directories, write access is not sufficient; * you can remove entries from a directory only if: * * you own the directory, * you own the entry, * the entry is a plain file and you have write access, * or you are privileged (checked in secpolicy...). * * The function returns 0 if remove access is granted. */ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; uid_t downer; uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_mode & S_ISVTX) == 0) return (0); downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else return (secpolicy_vnode_remove(ZTOV(zp), cr)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 329627) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 329628) @@ -1,4151 +1,4151 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. 
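/*
 * The sticky-directory policy implemented by
 * zfs_sticky_remove_access() above reduces to a small predicate; a
 * userland sketch under the stated rules (illustrative names, uids
 * passed in directly, the final privilege check omitted):
 */
#include <sys/types.h>

static int	/* returns 1 if removal is allowed without privilege */
demo_sticky_remove_ok(uid_t uid, uid_t dir_owner, uid_t file_owner,
    int is_regular, int has_write_access)
{
	if (uid == dir_owner || uid == file_owner)
		return (1);
	if (is_regular && has_write_access)
		return (1);
	return (0);	/* fall back to a privilege check */
}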
* Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); #if defined(__amd64__) static int zio_use_uma = 1; #else static int zio_use_uma = 0; #endif SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, "Use uma(9) for ZIO allocations"); static int zio_exclude_metadata = 0; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); zio_trim_stats_t zio_trim_stats = { { "bytes", KSTAT_DATA_UINT64, "Number of bytes successfully TRIMmed" }, { "success", KSTAT_DATA_UINT64, "Number of successful TRIM requests" }, { "unsupported", KSTAT_DATA_UINT64, "Number of TRIM requests that failed because TRIM is not supported" }, { "failed", KSTAT_DATA_UINT64, "Number of TRIM requests that failed for reasons other than not supported" }, }; static kstat_t *zio_trim_ksp; /* * ========================================================================== * I/O type descriptions * ========================================================================== */ const char *zio_type_name[ZIO_TYPES] = { "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", "zio_ioctl" }; boolean_t zio_dva_throttle_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN, &zio_dva_throttle_enabled, 0, ""); /* * ========================================================================== * I/O kmem caches * ========================================================================== */ kmem_cache_t *zio_cache; kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; #ifdef _KERNEL extern vmem_t *zio_alloc_arena; #endif #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 #define BP_SPANB(indblkshift, level) \ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT))) #define COMPARE_META_LEVEL 0x80000000ul /* * The following actions directly affect the spa's sync-to-convergence logic. * The values below define the sync pass when we start performing the action. * Care should be taken when changing these values as they directly impact * spa_sync() performance. Tuning these values may introduce subtle performance * pathologies and should only be done in the context of performance analysis. * These tunables will eventually be removed and replaced with #defines once * enough analysis has been done to determine optimal values. * * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that * regular blocks are not deferred.
*/ int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN, &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass"); int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN, &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass"); int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN, &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass"); /* * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; #ifdef illumos #ifdef ZFS_DEBUG int zio_buf_debug_limit = 16384; #else int zio_buf_debug_limit = 0; #endif #endif static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); void zio_init(void) { size_t c; zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); if (!zio_use_uma) goto out; /* * For small buffers, we want a cache for each multiple of * SPA_MINBLOCKSIZE. For larger buffers, we want a cache * for each quarter-power of 2. */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; size_t p2 = size; size_t align = 0; int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0; while (!ISP2(p2)) p2 &= p2 - 1; #ifdef illumos #ifndef _KERNEL /* * If we are using watchpoints, put each buffer on its own page, * to eliminate the performance overhead of trapping to the * kernel when modifying a non-watched buffer that shares the * page with a watched buffer. */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; #endif #endif /* illumos */ if (size <= 4 * SPA_MINBLOCKSIZE) { align = SPA_MINBLOCKSIZE; } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = MIN(p2 >> 2, PAGESIZE); } if (align != 0) { char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags); /* * Since zio_data bufs do not appear in crash dumps, we * pass KMC_NOTOUCH so that no allocator metadata is * stored with the buffers. 
*/ (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, NULL, cflags | KMC_NOTOUCH | KMC_NODEBUG); } } while (--c != 0) { ASSERT(zio_buf_cache[c] != NULL); if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; ASSERT(zio_data_buf_cache[c] != NULL); if (zio_data_buf_cache[c - 1] == NULL) zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; } out: zio_inject_init(); zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc", KSTAT_TYPE_NAMED, sizeof(zio_trim_stats) / sizeof(kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zio_trim_ksp != NULL) { zio_trim_ksp->ks_data = &zio_trim_stats; kstat_install(zio_trim_ksp); } } void zio_fini(void) { size_t c; kmem_cache_t *last_cache = NULL; kmem_cache_t *last_data_cache = NULL; for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { if (zio_buf_cache[c] != last_cache) { last_cache = zio_buf_cache[c]; kmem_cache_destroy(zio_buf_cache[c]); } zio_buf_cache[c] = NULL; if (zio_data_buf_cache[c] != last_data_cache) { last_data_cache = zio_data_buf_cache[c]; kmem_cache_destroy(zio_data_buf_cache[c]); } zio_data_buf_cache[c] = NULL; } kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); if (zio_trim_ksp != NULL) { kstat_delete(zio_trim_ksp); zio_trim_ksp = NULL; } } /* * ========================================================================== * Allocate and free I/O buffers * ========================================================================== */ /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. Obviously, it's * useful to inspect ZFS metadata, but if possible, we should avoid keeping * excess / transient data in-core during a crashdump. */ void * zio_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; int flags = zio_exclude_metadata ? KM_NODEBUG : 0; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP|flags)); } /* * Use zio_data_buf_alloc to allocate data. The data will not appear in a * crashdump if the kernel panics. This exists so that we will limit the amount * of ZFS data that shows up in a kernel crashdump. 
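/*
 * A userland sketch of the size-class math used by the allocators
 * above, assuming the 512-byte SPA quantum (SPA_MINBLOCKSHIFT == 9):
 * the cache index is simply (size - 1) >> 9, and zio_init()'s ISP2
 * loop derives the alignment from the largest power of two <= size.
 * Names prefixed demo_ are illustrative.
 */
#include <stddef.h>

#define	DEMO_SPA_MINBLOCKSHIFT	9

static size_t
demo_cache_index(size_t size)	/* 1..512 -> 0, 513..1024 -> 1, ... */
{
	return ((size - 1) >> DEMO_SPA_MINBLOCKSHIFT);
}

static size_t
demo_floor_pow2(size_t size)	/* the p2 &= p2 - 1 loop above */
{
	size_t p2 = size;

	while (p2 & (p2 - 1))	/* clear low set bits until a power of 2 */
		p2 &= p2 - 1;
	return (p2);
}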
(Thus reducing the amount * of kernel heap dumped to disk when the kernel panics) */ void * zio_data_buf_alloc(size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); else return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG)); } void zio_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_buf_cache[c], buf); else kmem_free(buf, size); } void zio_data_buf_free(void *buf, size_t size) { size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); if (zio_use_uma) kmem_cache_free(zio_data_buf_cache[c], buf); else kmem_free(buf, size); } /* * ========================================================================== * Push and pop I/O transform buffers * ========================================================================== */ void zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); /* * Ensure that anyone expecting this zio to contain a linear ABD isn't * going to get a nasty surprise when they try to access the data. */ #ifdef illumos IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); #else IMPLY(zio->io_abd != NULL && abd_is_linear(zio->io_abd), abd_is_linear(data)); #endif zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; zio->io_abd = data; zio->io_size = size; } void zio_pop_transforms(zio_t *zio) { zio_transform_t *zt; while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) abd_free(zio->io_abd); zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; kmem_free(zt, sizeof (zio_transform_t)); } } /* * ========================================================================== * I/O transform callbacks for subblocks and decompression * ========================================================================== */ static void zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) abd_copy(data, zio->io_abd, size); } static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), zio->io_abd, tmp, zio->io_size, size); abd_return_buf_copy(data, tmp, size); if (ret != 0) zio->io_error = SET_ERROR(EIO); } } /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ zio_t * zio_walk_parents(zio_t *cio, zio_link_t **zl) { list_t *pl = &cio->io_parent_list; *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_child == cio); return ((*zl)->zl_parent); } zio_t * zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); ASSERT((*zl)->zl_parent == pio); return ((*zl)->zl_child); } zio_t * zio_unique_parent(zio_t *cio) { zio_link_t *zl = NULL; zio_t *pio = zio_walk_parents(cio, &zl); VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); return (pio); } void zio_add_child(zio_t *pio, zio_t *cio) { zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. * Vdev I/Os can only have vdev children. * The following ASSERT captures all of these constraints. */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); zl->zl_parent = pio; zl->zl_child = cio; mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); pio->io_child_count++; cio->io_parent_count++; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); } static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); pio->io_child_count--; cio->io_parent_count--; mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); kmem_cache_free(zio_link_cache, zl); } static boolean_t zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) { boolean_t waiting = B_FALSE; mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); for (int c = 0; c < ZIO_CHILD_TYPES; c++) { if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) continue; uint64_t *countp = &zio->io_children[c][wait]; if (*countp != 0) { zio->io_stage >>= 1; ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); zio->io_stall = countp; waiting = B_TRUE; break; } } mutex_exit(&zio->io_lock); return (waiting); } static void zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) { uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; int *errorp = &pio->io_child_error[zio->io_child_type]; mutex_enter(&pio->io_lock); if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); (*countp)--; if (*countp == 0 && pio->io_stall == countp) { zio_taskq_type_t type = pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : ZIO_TASKQ_INTERRUPT; pio->io_stall = NULL; mutex_exit(&pio->io_lock); /* * Dispatch the parent zio in its own taskq so that * the child can continue to make progress. This also * prevents overflowing the stack when we have deeply nested * parent-child relationships. 
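/*
 * A single-threaded userland model of the countdown implemented by
 * zio_wait_for_children()/zio_notify_parent() above (illustrative
 * names; the real code protects these fields with io_lock): the
 * parent records how many children are still outstanding, each
 * completing child decrements the counter, and the child that drops
 * it to zero is the one that re-dispatches a stalled parent.
 */
#include <assert.h>

typedef struct demo_parent {
	unsigned	p_outstanding;	/* children not yet done */
	int		p_stalled;	/* parent waiting on them */
} demo_parent_t;

static int	/* returns 1 if the caller must re-dispatch the parent */
demo_child_done(demo_parent_t *p)
{
	assert(p->p_outstanding > 0);
	if (--p->p_outstanding == 0 && p->p_stalled) {
		p->p_stalled = 0;	/* parent may make progress again */
		return (1);
	}
	return (0);
}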
*/ zio_taskq_dispatch(pio, type, B_FALSE); } else { mutex_exit(&pio->io_lock); } } static void zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; } int zio_bookmark_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) return (-1); if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) return (1); if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) return (-1); if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) return (1); if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) return (-1); if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) return (1); if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) return (-1); if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) return (1); if (z1 < z2) return (-1); if (z1 > z2) return (1); return (0); } /* * ========================================================================== * Create the various types of I/O (read, write, free, etc) * ========================================================================== */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; ASSERT3U(type == ZIO_TYPE_FREE || psize, <=, SPA_MAXBLOCKSIZE); ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); list_create(&zio->io_parent_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_parent_node)); list_create(&zio->io_child_list, sizeof (zio_link_t), offsetof(zio_link_t, zl_child_node)); metaslab_trace_init(&zio->io_alloc_list); if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; else if (flags & ZIO_FLAG_DDT_CHILD) zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; zio->io_pipeline_trace = ZIO_STAGE_OPEN; zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); if (zb != NULL) zio->io_bookmark = *zb; 
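/*
 * zio_bookmark_compare() above establishes a total order: it compares
 * the four bookmark fields in significance order and, only when all
 * four tie, falls back to the zio addresses so that distinct zios
 * never compare equal.  The same pattern in miniature (illustrative
 * type; the pointer tie-break mirrors the z1 < z2 test above):
 */
typedef struct demo_key {
	unsigned long k_major;
	unsigned long k_minor;
} demo_key_t;

static int
demo_compare(const demo_key_t *a, const demo_key_t *b)
{
	if (a->k_major != b->k_major)
		return (a->k_major < b->k_major ? -1 : 1);
	if (a->k_minor != b->k_minor)
		return (a->k_minor < b->k_minor ? -1 : 1);
	/* tie-break on address to keep the order total */
	if (a != b)
		return (a < b ? -1 : 1);
	return (0);
}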
if (pio != NULL) { if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } return (zio); } static void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); list_destroy(&zio->io_parent_list); list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); } zio_t * zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { return (zio_null(NULL, spa, NULL, done, private, flags)); } void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) { if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { zfs_panic_recover("blkptr at %p has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { zfs_panic_recover("blkptr at %p has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) { zfs_panic_recover("blkptr at %p has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } /* * Pool-specific checks. * * Note: it would be nice to verify that the blk_birth and * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() * allows the birth time of log blocks (and dmu_sync()-ed blocks * that are in the log) to be arbitrarily large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); if (vdevid >= spa->spa_root_vdev->vdev_children) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { zfs_panic_recover("blkptr at %p DVA %u has hole " "VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_missing_ops) { /* * "missing" vdevs are valid during import, but we * don't have their detailed info (e.g. asize), so * we can't perform any more checks on them. 
*/ continue; } uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); if (BP_IS_GANG(bp)) asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); if (offset + asize > vd->vdev_asize) { zfs_panic_recover("blkptr at %p DVA %u has invalid " "OFFSET %llu", bp, i, (longlong_t)offset); } } } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; zfs_blkptr_verify(spa, bp); zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && zp->zp_compress >= ZIO_COMPRESS_OFF && zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && DMU_OT_IS_VALID(zp->zp_type) && zp->zp_level < 32 && zp->zp_copies > 0 && zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_children_ready = children_ready; zio->io_physdone = physdone; zio->io_prop = *zp; /* * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to * dedup (just take the already-allocated BP verbatim). */ if (data == NULL && zio->io_prop.zp_dedup_verify) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } return (zio); } zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { zio_t *zio; zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); return (zio); } void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio->io_stage == ZIO_STAGE_OPEN); ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); /* * We must reset the io_prop to match the values that existed * when the bp was first written by dmu_sync() keeping in mind * that nopwrite and dedup are mutually exclusive. */ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_copies = copies; zio->io_bp_override = bp; } void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { /* * The check for EMBEDDED is a performance optimization. We * process the free here (by ignoring it) rather than * putting it on the list and then processing it in zio_free_sync(). 
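 *
 * (An embedded bp stores its payload inside the block pointer itself
 * and has no allocated DVAs, so there is nothing to free and nothing
 * to account for in the space maps.)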
*/ if (BP_IS_EMBEDDED(bp)) return; metaslab_check_free(spa, bp); /* * Frees that are for the currently-syncing txg, are not going to be * deferred, and which will not need to do a read (i.e. not GANG or * DEDUP), can be processed immediately. Otherwise, put them on the * in-memory list for later processing. */ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) || txg != spa->spa_syncing_txg || spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) { bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); } else { VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, BP_GET_PSIZE(bp), 0))); } } zio_t * zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, uint64_t size, enum zio_flag flags) { zio_t *zio; enum zio_stage stage = ZIO_FREE_PIPELINE; ASSERT(!BP_IS_HOLE(bp)); ASSERT(spa_syncing_txg(spa) == txg); ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); metaslab_check_free(spa, bp); arc_freed(spa, bp); if (zfs_trim_enabled) stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_ASSESS; /* * GANG and DEDUP blocks can induce a read (for the gang block header, * or the DDT), so issue them asynchronously so that this thread is * not tied up. */ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; flags |= ZIO_FLAG_DONT_QUEUE; zio = zio_create(pio, spa, txg, bp, NULL, size, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); return (zio); } zio_t * zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; dprintf_bp(bp, "claiming in txg %llu", txg); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); /* * A claim is an allocation of a specific block. Claims are needed * to support immediate writes in the intent log. The issue is that * immediate writes contain committed data, but in a txg that was * *not* committed. Upon opening the pool after an unclean shutdown, * the intent log claims all blocks that contain immediate write data * so that the SPA knows they're in use. * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. 
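 *
 * A typical caller, simplified from the ZIL claim path:
 *
 *	(void) zio_wait(zio_claim(NULL, spa, first_txg, bp,
 *	    NULL, NULL, ZIO_FLAG_CANFAIL));
 *
 * where first_txg is spa_first_txg(spa) during a real claim, or 0 to
 * merely verify claimability (as zdb does).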
*/ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); ASSERT(txg == spa_first_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); ASSERT0(zio->io_queued_timestamp); return (zio); } zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; } else { zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, offset, size, done, private, priority, flags)); } return (zio); } zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; return (zio); } zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; ASSERT(vd->vdev_children == 0); ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { /* * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ abd_t *wbuf = abd_alloc_sametype(data, size); abd_copy(wbuf, data, size); zio_push_transform(zio, wbuf, size, size, NULL); } return (zio); } /* * Create a child I/O to do some work for us. */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev)); if (type == ZIO_TYPE_READ && bp != NULL) { /* * If we have the bp, then the child should perform the * checksum and the parent need not. This pushes error * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. 
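 * For a mirror read, e.g., the leaf child verifies the copy it
 * returns, so the mirror parent can hand the data up without
 * re-checksumming it.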
*/ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* Not all IO types require vdev io done stage e.g. free */ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; /* * If we've decided to do a repair, the write is not speculative -- * even if the original read was. */ if (flags & ZIO_FLAG_IO_REPAIR) flags &= ~ZIO_FLAG_SPECULATIVE; /* * If we're creating a child I/O that is not associated with a * top-level vdev, then the child zio is not an allocating I/O. * If this is a retried I/O then we ignore it since we will * have already processed the original allocating I/O. */ if (flags & ZIO_FLAG_IO_ALLOCATING && (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { metaslab_class_t *mc = spa_normal_class(pio->io_spa); ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(type == ZIO_TYPE_WRITE); ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || pio->io_child_type == ZIO_CHILD_GANG); flags &= ~ZIO_FLAG_IO_ALLOCATING; } zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); zio->io_physdone = pio->io_physdone; if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { zio_t *zio; ASSERT(vd->vdev_ops->vdev_op_leaf); zio = zio_create(NULL, vd->vdev_spa, 0, NULL, data, size, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, vd, offset, NULL, ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } zio_t * zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size) { ASSERT(vd->vdev_ops->vdev_op_leaf); return (zio_create(zio, spa, 0, NULL, NULL, size, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void zio_shrink(zio_t *zio, uint64_t size) { ASSERT3P(zio->io_executor, ==, NULL); ASSERT3P(zio->io_orig_size, ==, zio->io_size); ASSERT3U(size, <=, zio->io_size); /* * We don't shrink for raidz because of problems with the * reconstruction when reading back less than the block size. * Note, BP_IS_RAIDZ() assumes no compression. 
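 * (zio_shrink() is used by the ZIL to trim a log-block write down to
 * the bytes actually used; the check below simply leaves raidz
 * blocks at their original size.)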
*/ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); if (!BP_IS_RAIDZ(zio->io_bp)) { /* we are not doing a raw write */ ASSERT3U(zio->io_size, ==, zio->io_lsize); zio->io_orig_size = zio->io_size = zio->io_lsize = size; } } /* * ========================================================================== * Prepare to read and write logical blocks * ========================================================================== */ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) zio->io_flags |= ZIO_FLAG_DONT_CACHE; if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); ASSERT(zio->io_child_type != ZIO_CHILD_DDT); if (zio->io_bp_override) { blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; ASSERT(bp->blk_birth != zio->io_txg); ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; if (BP_IS_EMBEDDED(bp)) return (ZIO_PIPELINE_CONTINUE); /* * If we've been overridden and nopwrite is set then * set the flag accordingly to indicate that a nopwrite * has already occurred. */ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { ASSERT(!zp->zp_dedup); ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); zio->io_flags |= ZIO_FLAG_NOPWRITE; return (ZIO_PIPELINE_CONTINUE); } ASSERT(!zp->zp_nopwrite); if (BP_IS_HOLE(bp) || !zp->zp_dedup) return (ZIO_PIPELINE_CONTINUE); ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } return (ZIO_PIPELINE_CONTINUE); } static int zio_write_compress(zio_t *zio) { spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; uint64_t lsize = zio->io_lsize; uint64_t psize = zio->io_size; int pass = 1; EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. 
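 * zio_wait_for_children() returns B_TRUE while children are still
 * outstanding; it records this zio as the stalled parent, and the
 * last child to complete re-dispatches us so this stage runs again
 * from the top.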
*/ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { return (ZIO_PIPELINE_STOP); } if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); if (zio->io_children_ready != NULL) { /* * Now that all our children are ready, run the callback * associated with this zio in case it wants to modify the * data to be written. */ ASSERT3U(zp->zp_level, >, 0); zio->io_children_ready(zio); } ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to * converge, it must eventually be the case that we don't * have to allocate new blocks. But compression changes * the blocksize, which forces a reallocate, and makes * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ pass = spa_sync_pass(spa); ASSERT(zio->io_txg == spa_syncing_txg(spa)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!BP_GET_DEDUP(bp)); if (pass >= zfs_sync_pass_dont_compress) compress = ZIO_COMPRESS_OFF; /* Make sure someone doesn't change their mind on overwrites */ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); return (ZIO_PIPELINE_CONTINUE); } else { /* * Round up compressed size up to the ashift * of the smallest-ashift device, and zero the tail. * This ensures that the compressed size of the BP * (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); size_t rounded = (size_t)P2ROUNDUP(psize, 1ULL << spa->spa_min_ashift); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); psize = lsize; } else { abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); } } /* * We were unable to handle this as an override bp, treat * it as a regular write I/O. */ zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; } else { ASSERT3U(psize, !=, 0); } /* * The final pass of spa_sync() must be all rewrites, but the first * few passes offer a trade-off: allocating blocks defers convergence, * but newly allocated blocks are sequential, so they can be written * to disk faster. Therefore, we allow the first few passes of * spa_sync() to allocate new blocks, but force rewrites after that. 
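 * (With the historical defaults, zfs_sync_pass_dont_compress == 5
 * and zfs_sync_pass_rewrite == 2, so compression stops at pass 5 and
 * same-size rewrites are forced from pass 2 on.)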
* There should only be a handful of blocks after pass 1 in any case. */ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { ASSERT(psize != 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { BP_ZERO(bp); zio->io_pipeline = ZIO_WRITE_PIPELINE; } if (psize == 0) { if (zio->io_bp_orig.blk_birth != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_BIRTH(bp, zio->io_txg, 0); } zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; } } return (ZIO_PIPELINE_CONTINUE); } static int zio_free_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { if (BP_GET_DEDUP(bp)) zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Execute the I/O pipeline * ========================================================================== */ static void zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) { spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT); /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* * A similar issue exists for the L2ARC write thread until L2ARC 2.0. */ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; /* * If this is a high priority I/O, then use the high priority taskq if * available. */ if (zio->io_priority == ZIO_PRIORITY_NOW && spa->spa_zio_taskq[t][q + 1].stqs_count != 0) q++; ASSERT3U(q, <, ZIO_TASKQ_TYPES); /* * NB: We are assuming that the zio can only be dispatched * to a single taskq at a time. It would be a grievous error * to dispatch the zio to another taskq at the same time. 
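 * The platform-specific ASSERTs below verify that the zio's embedded
 * task entry is idle before we dispatch it (tqent_next on illumos,
 * ta_pending on FreeBSD).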
*/ #if defined(illumos) || !defined(_KERNEL) ASSERT(zio->io_tqent.tqent_next == NULL); #else ASSERT(zio->io_tqent.tqent_task.ta_pending == 0); #endif spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, flags, &zio->io_tqent); } static boolean_t zio_taskq_member(zio_t *zio, zio_taskq_type_t q) { kthread_t *executor = zio->io_executor; spa_t *spa = zio->io_spa; for (zio_type_t t = 0; t < ZIO_TYPES; t++) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; uint_t i; for (i = 0; i < tqs->stqs_count; i++) { if (taskq_member(tqs->stqs_taskq[i], executor)) return (B_TRUE); } } return (B_FALSE); } static int zio_issue_async(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } void zio_interrupt(zio_t *zio) { zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } void zio_delay_interrupt(zio_t *zio) { /* * The timeout_generic() function isn't defined in userspace, so * rather than trying to implement the function, the zio delay * functionality has been disabled for userspace builds. */ #ifdef _KERNEL /* * If io_target_timestamp is zero, then no delay has been registered * for this IO, thus jump to the end of this function and "skip" the * delay; issuing it directly to the zio layer. */ if (zio->io_target_timestamp != 0) { hrtime_t now = gethrtime(); if (now >= zio->io_target_timestamp) { /* * This IO has already taken longer than the target * delay to complete, so we don't want to delay it * any longer; we "miss" the delay and issue it * directly to the zio layer. This is likely due to * the target latency being set to a value less than * the underlying hardware can satisfy (e.g. delay * set to 1ms, but the disks take 10ms to complete an * IO request). */ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, hrtime_t, now); zio_interrupt(zio); } else { hrtime_t diff = zio->io_target_timestamp - now; DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, hrtime_t, now, hrtime_t, diff); (void) timeout_generic(CALLOUT_NORMAL, (void (*)(void *))zio_interrupt, zio, diff, 1, 0); } return; } #endif DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); zio_interrupt(zio); } /* * Execute the I/O pipeline until one of the following occurs: * * (1) the I/O completes * (2) the pipeline stalls waiting for dependent child I/Os * (3) the I/O issues, so we're waiting for an I/O completion interrupt * (4) the I/O is delegated by vdev-level caching or aggregation * (5) the I/O is deferred due to vdev-level queueing * (6) the I/O is handed off to another thread. * * In all cases, the pipeline stops whenever there's no CPU work; it never * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. */ static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) { zio->io_executor = curthread; ASSERT3U(zio->io_queued_timestamp, >, 0); while (zio->io_stage < ZIO_STAGE_DONE) { enum zio_stage pipeline = zio->io_pipeline; enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); ASSERT(ISP2(stage)); ASSERT(zio->io_stall == NULL); do { stage <<= 1; } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, * or may wait for an I/O that needs an interrupt thread * to complete, issue async to avoid deadlock. * * For VDEV_IO_START, we cut in line so that the io will * be sent to disk promptly. 
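 * (ZIO_BLOCKING_STAGES is the set { DVA_ALLOCATE, DVA_CLAIM,
 * VDEV_IO_START }; see zio_impl.h.)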
*/ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; ASSERT(rv == ZIO_PIPELINE_CONTINUE); } } /* * ========================================================================== * Initiate I/O, either sync or async * ========================================================================== */ int zio_wait(zio_t *zio) { int error; ASSERT3P(zio->io_stage, ==, ZIO_STAGE_OPEN); ASSERT3P(zio->io_executor, ==, NULL); zio->io_waiter = curthread; ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); zio_execute(zio); mutex_enter(&zio->io_lock); while (zio->io_executor != NULL) cv_wait(&zio->io_cv, &zio->io_lock); mutex_exit(&zio->io_lock); error = zio->io_error; zio_destroy(zio); return (error); } void zio_nowait(zio_t *zio) { ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. * We add it to the spa_async_root_zio "Godfather" I/O which * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio); } ASSERT0(zio->io_queued_timestamp); zio->io_queued_timestamp = gethrtime(); zio_execute(zio); } /* * ========================================================================== * Reexecute, cancel, or suspend/resume failed I/O * ========================================================================== */ static void zio_reexecute(zio_t *pio) { zio_t *cio, *cio_next; ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); ASSERT(pio->io_gang_leader == NULL); ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_flags |= ZIO_FLAG_REEXECUTED; pio->io_pipeline_trace = 0; pio->io_error = 0; for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; if (IO_IS_ALLOCATING(pio)) BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that * the remainder of pio's io_child_list, from 'cio_next' onward, * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. * We don't reexecute "The Godfather" I/O here as it's the - * responsibility of the caller to wait on him. + * responsibility of the caller to wait on it. 
*/ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) { pio->io_queued_timestamp = gethrtime(); zio_execute(pio); } } void zio_suspend(spa_t *spa, zio_t *zio) { if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC) fm_panic("Pool '%s' has encountered an uncorrectable I/O " "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } mutex_exit(&spa->spa_suspend_lock); } int zio_resume(spa_t *spa) { zio_t *pio; /* * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); spa->spa_suspended = B_FALSE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; spa->spa_suspend_zio_root = NULL; mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) return (0); zio_reexecute(pio); return (zio_wait(pio)); } void zio_resume_wait(spa_t *spa) { mutex_enter(&spa->spa_suspend_lock); while (spa_suspended(spa)) cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock); mutex_exit(&spa->spa_suspend_lock); } /* * ========================================================================== * Gang blocks. * * A gang block is a collection of small blocks that looks to the DMU * like one large block. When zio_dva_allocate() cannot find a block * of the requested size, due to either severe fragmentation or the pool * being nearly full, it calls zio_write_gang_block() to construct the * block from smaller fragments. * * A gang block consists of a gang header (zio_gbh_phys_t) and up to * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like * an indirect block: it's an array of block pointers. It consumes * only one sector and hence is allocatable regardless of fragmentation. * The gang header's bps point to its gang members, which hold the data. * * Gang blocks are self-checksumming, using the bp's * as the verifier to ensure uniqueness of the SHA256 checksum. * Critically, the gang block bp's blk_cksum is the checksum of the data, * not the gang header. This ensures that data block signatures (needed for * deduplication) are independent of how the block is physically stored. * * Gang blocks can be nested: a gang member may itself be a gang block. * Thus every gang block is a tree in which root and all interior nodes are * gang headers, and the leaves are normal blocks that contain user data. * The root of the gang tree is called the gang leader. * * To perform any operation (read, rewrite, free, claim) on a gang block, * zio_gang_assemble() first assembles the gang tree (minus data leaves) * in the io_gang_tree field of the original logical i/o by recursively * reading the gang leader and all gang headers below it. This yields * an in-core tree containing the contents of every gang header and the * bps for every constituent of the gang block. * * With the gang tree now assembled, zio_gang_issue() just walks the gang tree * and invokes a callback on each bp. To free a gang block, zio_gang_issue() * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. 
* zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). * zio_read_gang() is a wrapper around zio_read() that omits reading gang * headers, since we already have those in io_gang_tree. zio_rewrite_gang() * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() * of the gang header plus zio_checksum_compute() of the data to update the * gang header's blk_cksum as described above. * * The two-phase assemble/issue model solves the problem of partial failure -- * what if you'd freed part of a gang block but then couldn't read the * gang header for another part? Assembling the entire gang tree first * ensures that all the necessary gang header I/O has succeeded before * starting the actual work of free, claim, or write. Once the gang tree * is assembled, free and claim are in-memory operations that cannot fail. * * In the event that a gang write fails, zio_dva_unallocate() walks the * gang tree to immediately free (i.e. insert back into the space map) * everything we've allocated. This ensures that we don't get ENOSPC * errors during repeated suspend/resume cycles due to a flaky device. * * Gang rewrites only happen during sync-to-convergence. If we can't assemble * the gang tree, we won't modify the block, so we can safely defer the free * (knowing that the block is still intact). If we *can* assemble the gang * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free * each constituent bp and we can allocate a new block on the next sync pass. * * In all cases, the gang tree allows complete recovery from partial failure. * ========================================================================== */ static void zio_gang_issue_func_done(zio_t *zio) { abd_put(zio->io_abd); } static zio_t * zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { if (gn != NULL) return (pio); return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } static zio_t * zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { zio_t *zio; if (gn != NULL) { abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will * compute a new data checksum, so we do that here. The one * exception is the gang leader: the pipeline already computed * its data checksum because that stage precedes gang assembly. * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { abd_t *buf = abd_get_offset(data, offset); zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), buf, BP_GET_PSIZE(bp)); abd_put(buf); } /* * If we are here to damage data for testing purposes, * leave the GBH alone so that we can detect the damage. 
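 * (ZIO_FLAG_INDUCE_DAMAGE comes from the fault-injection machinery,
 * e.g. zinject/ztest, to simulate corruption.)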
*/ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, abd_get_offset(data, offset), BP_GET_PSIZE(bp), zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } return (zio); } /* ARGSUSED */ static zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp), ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ static zio_t * zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); } static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { NULL, zio_read_gang, zio_rewrite_gang, zio_free_gang, zio_claim_gang, NULL }; static void zio_gang_tree_assemble_done(zio_t *zio); static zio_gang_node_t * zio_gang_node_alloc(zio_gang_node_t **gnpp) { zio_gang_node_t *gn; ASSERT(*gnpp == NULL); gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); *gnpp = gn; return (gn); } static void zio_gang_node_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) ASSERT(gn->gn_child[g] == NULL); zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); kmem_free(gn, sizeof (*gn)); *gnpp = NULL; } static void zio_gang_tree_free(zio_gang_node_t **gnpp) { zio_gang_node_t *gn = *gnpp; if (gn == NULL) return; for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) zio_gang_tree_free(&gn->gn_child[g]); zio_gang_node_free(gnpp); } static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); ASSERT(zio->io_child_count == 0); if (zio->io_error) return; /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); abd_put(zio->io_abd); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, offset); offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree && gio->io_abd != NULL) ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); } static int zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); return (ZIO_PIPELINE_CONTINUE); } static int zio_gang_issue(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { return (ZIO_PIPELINE_STOP); } ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, 0); else zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; return (ZIO_PIPELINE_CONTINUE); } static void zio_write_gang_member_ready(zio_t *zio) { zio_t *pio = zio_unique_parent(zio); zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; if (BP_IS_HOLE(zio->io_bp)) return; ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { ASSERT(DVA_GET_GANG(&pdva[d])); asize = DVA_GET_ASIZE(&pdva[d]); asize += DVA_GET_ASIZE(&cdva[d]); DVA_SET_ASIZE(&pdva[d], asize); } mutex_exit(&pio->io_lock); } static void zio_write_gang_done(zio_t *zio) { abd_put(zio->io_abd); } static int zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = pio->io_bp; zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); flags |= METASLAB_ASYNC_ALLOC; VERIFY(refcount_held(&mc->mc_alloc_slots, pio)); /* * The logical zio has already placed a reservation for * 'copies' allocation slots but gang blocks may require * additional copies. These additional copies * (i.e. gbh_copies - copies) are guaranteed to succeed * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio, flags)); } error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, flags, &pio->io_alloc_list, pio); if (error) { if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * If we failed to allocate the gang block header then * we remove any additional allocation reservations that * we placed here. The original reservation will * be removed when the logical I/O goes to the ready * stage. */ metaslab_class_throttle_unreserve(mc, gbh_copies - copies, pio); } pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } if (pio == gio) { gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); } gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, zio_write_gang_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. */ for (int g = 0; resid != 0; resid -= lsize, g++) { lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, lsize, &zp, zio_write_gang_member_ready, NULL, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA)); /* * Gang children won't throttle but we should * account for their work, so reserve an allocation * slot for them here. */ VERIFY(metaslab_class_throttle_reserve(mc, zp.zp_copies, cio, flags)); } zio_nowait(cio); } /* * Set pio's pipeline to just wait for zio to finish. */ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio_nowait(zio); return (ZIO_PIPELINE_CONTINUE); } /* * The zio_nop_write stage in the pipeline determines if allocating a * new bp is necessary. The nopwrite feature can handle writes in * either syncing or open context (i.e. zil writes) and as a result is * mutually exclusive with dedup. * * By leveraging a cryptographically secure checksum, such as SHA256, we * can compare the checksums of the new data and the old to determine if * allocating a new block is required. Note that our requirements for * cryptographic strength are fairly weak: there can't be any accidental * hash collisions, but we don't need to be secure against intentional * (malicious) collisions. To trigger a nopwrite, you have to be able * to write the file to begin with, and triggering an incorrect (hash * collision) nopwrite is no worse than simply writing to the file. * That said, there are no known attacks against the checksum algorithms * used for nopwrite, assuming that the salt and the checksums * themselves remain secret. 
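 *
 * Only checksums whose zio_checksum_table entry carries
 * ZCHECKSUM_FLAG_NOPWRITE (the dedup-strength ones, e.g. sha256,
 * sha512, skein) can arm nopwrite; zio_nop_write() below rechecks
 * that flag before trusting a checksum match.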
*/ static int zio_nop_write(zio_t *zio) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; ASSERT(BP_GET_LEVEL(bp) == 0); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); ASSERT(zp->zp_nopwrite); ASSERT(!zp->zp_dedup); ASSERT(zio->io_bp_override == NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Check to see if the original bp and the new bp have matching * characteristics (i.e. same checksum, compression algorithms, etc). * If they don't then just continue with the pipeline which will * allocate a new bp. */ if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || zp->zp_copies != BP_GET_NDVAS(bp_orig)) return (ZIO_PIPELINE_CONTINUE); /* * If the checksums match then reset the pipeline so that we * avoid allocating a new bp and issuing any I/O. */ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, sizeof (uint64_t)) == 0); *bp = *bp_orig; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; zio->io_flags |= ZIO_FLAG_NOPWRITE; } return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Dedup * ========================================================================== */ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ if (zio->io_error == 0 && dde->dde_repair_abd == NULL) dde->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } static int zio_ddt_read_start(zio_t *zio) { blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); ddt_phys_t *ddp = dde->dde_phys; ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; if (ddp_self == NULL) return (ZIO_PIPELINE_CONTINUE); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); } static int zio_ddt_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { return (ZIO_PIPELINE_STOP); } ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_PSIZE(bp) == zio->io_size); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if 
(zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_vsd; if (ddt == NULL) { ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); return (ZIO_PIPELINE_CONTINUE); } if (dde == NULL) { zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_abd != NULL) { abd_copy(zio->io_abd, dde->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); zio->io_vsd = NULL; } ASSERT(zio->io_vsd == NULL); return (ZIO_PIPELINE_CONTINUE); } static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); /* We should never get a raw, override zio */ ASSERT(!(zio->io_bp_override && do_raw)); /* * Note: we compare the original data, not the transformed data, * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. */ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd, zio->io_orig_size) != 0); } } for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; blkptr_t blk = *zio->io_bp; int error; ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); ddt_exit(ddt); /* * Intuitively, it would make more sense to compare * io_abd than io_orig_abd in the raw case since you * don't want to look at any transformations that have * happened to the data. However, for raw I/Os the * data will actually be the same in io_abd and * io_orig_abd, so all we have to do is issue this as * a raw ARC read. 
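 * In other words, for a ZIO_FLAG_RAW write no transforms were ever
 * pushed, so io_abd and io_orig_abd hold identical bytes; the
 * ASSERTs below spell out exactly that invariant.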
*/ if (do_raw) { zio_flags |= ZIO_FLAG_RAW; ASSERT3U(zio->io_size, ==, zio->io_orig_size); ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, zio->io_size)); ASSERT3P(zio->io_transform_stack, ==, NULL); } error = arc_read(NULL, spa, &blk, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zio->io_bookmark); if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); arc_buf_destroy(abuf, &abuf); } ddt_enter(ddt); return (error != 0); } } return (B_FALSE); } static void zio_ddt_child_write_ready(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); zio_link_t *zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); ddt_exit(ddt); } static void zio_ddt_child_write_done(zio_t *zio) { int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; while (zio_walk_parents(zio, &zl) != NULL) ddt_phys_addref(ddp); } else { ddt_phys_clear(ddp); } ddt_exit(ddt); } static void zio_ddt_ditto_write_done(zio_t *zio) { int p = DDT_PHYS_DITTO; zio_prop_t *zp = &zio->io_prop; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_key_t *ddk = &dde->dde_key; ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); ASSERT(dde->dde_lead_zio[p] == zio); dde->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); if (ddp->ddp_phys_birth != 0) ddt_phys_free(ddt, ddk, ddp, zio->io_txg); ddt_phys_fill(ddp, bp); } ddt_exit(ddt); } static int zio_ddt_write(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; int p = zp->zp_copies; int ditto_copies; zio_t *cio = NULL; zio_t *dio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum * and try again. If we're already using a strong checksum, * we can't resolve it, so just convert to an ordinary write. * (And automatically e-mail a paper to Nature?) 
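 *
 * In practice this is the dedup=verify case riding on a weak
 * checksum such as fletcher4: we switch to spa_dedup_checksum()
 * (sha256 by default), rewind to ZIO_STAGE_OPEN, and let the
 * pipeline checksum the data again.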
*/ if (!(zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) { zp->zp_checksum = spa_dedup_checksum(spa); zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; BP_ZERO(bp); } else { zp->zp_dedup = B_FALSE; BP_SET_DEDUP(bp, B_FALSE); } ASSERT(!BP_GET_DEDUP(bp)); zio->io_pipeline = ZIO_WRITE_PIPELINE; ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); ASSERT(ditto_copies < SPA_DVAS_PER_BP); if (ditto_copies > ddt_ditto_copies_present(dde) && dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { zio_prop_t czp = *zp; czp.zp_copies = ditto_copies; /* * If we arrived here with an override bp, we won't have run * the transform stack, so we won't have the data we need to * generate a child i/o. So, toss the override bp and restart. * This is safe, because using the override bp is just an * optimization; and it's rare, so the cost doesn't matter. */ if (zio->io_bp_override) { zio_pop_transforms(zio); zio->io_stage = ZIO_STAGE_OPEN; zio->io_pipeline = ZIO_WRITE_PIPELINE; zio->io_bp_override = NULL; BP_ZERO(bp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); if (dde->dde_lead_zio[p] != NULL) zio_add_child(zio, dde->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { ASSERT(bp->blk_birth == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } ddt_exit(ddt); if (cio) zio_nowait(cio); if (dio) zio_nowait(dio); return (ZIO_PIPELINE_CONTINUE); } ddt_entry_t *freedde; /* for debugging */ static int zio_ddt_free(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); ddp = ddt_phys_select(dde, bp); ddt_phys_decref(ddp); ddt_exit(ddt); return (ZIO_PIPELINE_CONTINUE); } /* * ========================================================================== * Allocate and free blocks * ========================================================================== */ static zio_t * zio_io_to_allocate(spa_t *spa) { zio_t *zio; ASSERT(MUTEX_HELD(&spa->spa_alloc_lock)); zio = avl_first(&spa->spa_alloc_tree); if (zio == NULL) return (NULL); ASSERT(IO_IS_ALLOCATING(zio)); /* * Try to place a reservation for this zio. If we're unable to * reserve then we throttle. 
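 * Callers queue under spa_alloc_lock: zio_dva_throttle() inserts the
 * zio into spa_alloc_tree (ordered by zio_bookmark_compare()), and
 * as completing I/Os release their reservations,
 * zio_allocate_dispatch() pulls the next zio from here.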
ddt_entry_t *freedde;	/* for debugging */

static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * ==========================================================================
 * Allocate and free blocks
 * ==========================================================================
 */

static zio_t *
zio_io_to_allocate(spa_t *spa)
{
	zio_t *zio;

	ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));

	zio = avl_first(&spa->spa_alloc_tree);
	if (zio == NULL)
		return (NULL);

	ASSERT(IO_IS_ALLOCATING(zio));

	/*
	 * Try to place a reservation for this zio.  If we're unable to
	 * reserve then we throttle.
	 */
	if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
	    zio->io_prop.zp_copies, zio, 0)) {
		return (NULL);
	}

	avl_remove(&spa->spa_alloc_tree, zio);
	ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);

	return (zio);
}

static int
zio_dva_throttle(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *nio;

	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
	    !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
	    zio->io_child_type == ZIO_CHILD_GANG ||
	    zio->io_flags & ZIO_FLAG_NODATA) {
		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	ASSERT3U(zio->io_queued_timestamp, >, 0);
	ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);

	mutex_enter(&spa->spa_alloc_lock);

	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	avl_add(&spa->spa_alloc_tree, zio);

	nio = zio_io_to_allocate(zio->io_spa);
	mutex_exit(&spa->spa_alloc_lock);

	if (nio == zio)
		return (ZIO_PIPELINE_CONTINUE);

	if (nio != NULL) {
		ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
		/*
		 * We are passing control to a new zio so make sure that
		 * it is processed by a different thread.  We do this to
		 * avoid stack overflows that can occur when parents are
		 * throttled and children are making progress.  We allow
		 * it to go to the head of the taskq since it's already
		 * been waiting.
		 */
		zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
	}
	return (ZIO_PIPELINE_STOP);
}

void
zio_allocate_dispatch(spa_t *spa)
{
	zio_t *zio;

	mutex_enter(&spa->spa_alloc_lock);
	zio = zio_io_to_allocate(spa);
	mutex_exit(&spa->spa_alloc_lock);
	if (zio == NULL)
		return;

	ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
	ASSERT0(zio->io_error);
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
}

static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		flags |= METASLAB_DONT_THROTTLE;
	}
	if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
		flags |= METASLAB_GANG_CHILD;
	}
	if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
		flags |= METASLAB_ASYNC_ALLOC;
	}

	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
	    &zio->io_alloc_list, zio);

	if (error != 0) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_free(zio_t *zio)
{
	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_dva_claim(zio_t *zio)
{
	int error;

	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
	if (error)
		zio->io_error = error;

	return (ZIO_PIPELINE_CONTINUE);
}
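/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the allocation throttle above follows a reserve-or-queue
 * pattern -- an I/O either reserves an allocation slot and proceeds,
 * or parks in a sorted tree until zio_allocate_dispatch() pulls it out
 * when a slot frees up.  The minimal stand-alone model below uses a
 * counter and a simple stack instead of a refcount and an AVL tree
 * (wakeup order is simplified); all _example names are hypothetical.
 */
#define	EXAMPLE_MAX_SLOTS	4
#define	EXAMPLE_QUEUE_DEPTH	16

static int example_slots_in_use;
static int example_queue[EXAMPLE_QUEUE_DEPTH];
static int example_queue_len;

/* Returns 1 if the I/O may allocate now, 0 if it was queued. */
static int
throttle_reserve_example(int io_id)
{
	if (example_slots_in_use < EXAMPLE_MAX_SLOTS) {
		example_slots_in_use++;		/* reservation placed */
		return (1);
	}
	example_queue[example_queue_len++] = io_id;	/* throttle */
	return (0);
}

/* Called when an allocating I/O completes; hands its slot to a waiter. */
static int
throttle_release_example(void)
{
	example_slots_in_use--;
	if (example_queue_len == 0)
		return (-1);		/* nothing waiting */
	example_slots_in_use++;		/* hand the slot over */
	return (example_queue[--example_queue_len]);
}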
/*
 * Undo an allocation.  This is used by zio_done() when an I/O fails
 * and we want to give back the block we just allocated.
 * This handles both normal blocks and gang blocks.
 */
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
	ASSERT(zio->io_bp_override == NULL);

	if (!BP_IS_HOLE(bp))
		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);

	if (gn != NULL) {
		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			zio_dva_unallocate(zio, gn->gn_child[g],
			    &gn->gn_gbh->zg_blkptr[g]);
		}
	}
}

/*
 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
 */
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
    uint64_t size, boolean_t *slog)
{
	int error = 1;
	zio_alloc_list_t io_alloc_list;

	ASSERT(txg > spa_syncing_txg(spa));

	metaslab_trace_init(&io_alloc_list);
	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
	    txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL);
	if (error == 0) {
		*slog = TRUE;
	} else {
		error = metaslab_alloc(spa, spa_normal_class(spa), size,
		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID,
		    &io_alloc_list, NULL);
		if (error == 0)
			*slog = FALSE;
	}
	metaslab_trace_fini(&io_alloc_list);

	if (error == 0) {
		BP_SET_LSIZE(new_bp, size);
		BP_SET_PSIZE(new_bp, size);
		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
		BP_SET_CHECKSUM(new_bp,
		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
		BP_SET_LEVEL(new_bp, 0);
		BP_SET_DEDUP(new_bp, 0);
		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
	} else {
		zfs_dbgmsg("%s: zil block allocation failure: "
		    "size %llu, error %d", spa_name(spa), size, error);
	}

	return (error);
}

/*
 * Free an intent log block.
 */
void
zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
	ASSERT(!BP_IS_GANG(bp));

	zio_free(spa, txg, bp);
}
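/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): zio_alloc_zil() above tries the dedicated log class first
 * and falls back to the normal class, reporting through *slog which
 * class satisfied the request.  The generic try-preferred-then-fall-
 * back shape looks like the stand-alone helper below;
 * alloc_from_class_example() is a hypothetical stand-in for
 * metaslab_alloc().
 */
static int
alloc_from_class_example(int class_id, unsigned long long size)
{
	(void) class_id;
	(void) size;
	return (0);	/* pretend the allocation always succeeds */
}

static int
alloc_with_fallback_example(unsigned long long size, int *used_preferred)
{
	int error;

	error = alloc_from_class_example(0 /* log class */, size);
	if (error == 0) {
		*used_preferred = 1;
		return (0);
	}
	error = alloc_from_class_example(1 /* normal class */, size);
	if (error == 0)
		*used_preferred = 0;
	return (error);
}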
/*
 * ==========================================================================
 * Read, write and delete to physical devices
 * ==========================================================================
 */

/*
 * Issue an I/O to the underlying vdev.  Typically the issue pipeline
 * stops after this stage and will resume upon I/O completion.
 * However, there are instances where the vdev layer may need to
 * continue the pipeline when an I/O was not issued.  Since the I/O
 * that was sent to the vdev layer might be different than the one
 * currently active in the pipeline (see vdev_queue_io()), we explicitly
 * force the underlying vdev layers to call either zio_execute() or
 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
 */
static int
zio_vdev_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t align;
	spa_t *spa = zio->io_spa;
	int ret;

	ASSERT(zio->io_error == 0);
	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);

	if (vd == NULL) {
		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);

		/*
		 * The mirror_ops handle multiple DVAs in a single BP.
		 */
		vdev_mirror_ops.vdev_op_io_start(zio);
		return (ZIO_PIPELINE_STOP);
	}

	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority == ZIO_PRIORITY_NOW) {
		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT3P(zio->io_logical, !=, zio);

	/*
	 * We keep track of time-sensitive I/Os so that the scan thread
	 * can quickly react to certain workloads.  In particular, we care
	 * about non-scrubbing, top-level reads and writes with the following
	 * characteristics:
	 *	- synchronous writes of user data to non-slog devices
	 *	- any reads of user data
	 * When these conditions are met, adjust the timestamp of spa_last_io
	 * which allows the scan thread to adjust its workload accordingly.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
	    vd == vd->vdev_top && !vd->vdev_islog &&
	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
	    zio->io_txg != spa_syncing_txg(spa)) {
		uint64_t old = spa->spa_last_io;
		uint64_t new = ddi_get_lbolt64();
		if (old != new)
			(void) atomic_cas_64(&spa->spa_last_io, old, new);
	}

	align = 1ULL << vd->vdev_top->vdev_ashift;

	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
	    P2PHASE(zio->io_size, align) != 0) {
		/* Transform logical writes to be a full physical block size. */
		uint64_t asize = P2ROUNDUP(zio->io_size, align);
		abd_t *abuf = NULL;
		if (zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_WRITE)
			abuf = abd_alloc_sametype(zio->io_abd, asize);
		ASSERT(vd == vd->vdev_top);
		if (zio->io_type == ZIO_TYPE_WRITE) {
			abd_copy(abuf, zio->io_abd, zio->io_size);
			abd_zero_off(abuf, zio->io_size,
			    asize - zio->io_size);
		}
		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
		    zio_subblock);
	}

	/*
	 * If this is not a physical io, make sure that it is properly aligned
	 * before proceeding.
	 */
	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
		ASSERT0(P2PHASE(zio->io_offset, align));
		ASSERT0(P2PHASE(zio->io_size, align));
	} else {
		/*
		 * For the physical io we allow alignment
		 * to a logical block size.
		 */
		uint64_t log_align =
		    1ULL << vd->vdev_top->vdev_logical_ashift;
		ASSERT0(P2PHASE(zio->io_offset, log_align));
		ASSERT0(P2PHASE(zio->io_size, log_align));
	}

	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));

	/*
	 * If this is a repair I/O, and there's no self-healing involved --
	 * that is, we're just resilvering what we expect to resilver --
	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
	 * This prevents spurious resilvering with nested replication.
	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
	 * A is out of date, we'll read from C+D, then use the data to
	 * resilver A+B -- but we don't actually want to resilver B, just A.
	 * The top-level mirror has no way to know this, so instead we just
	 * discard unnecessary repairs as we work our way down the vdev tree.
	 * The same logic applies to any form of nested replication:
	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
	 */
	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
	    zio->io_txg != 0 &&	/* not a delegated i/o */
	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		zio_vdev_io_bypass(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		switch (zio->io_type) {
		case ZIO_TYPE_READ:
			if (vdev_cache_read(zio))
				return (ZIO_PIPELINE_CONTINUE);
			/* FALLTHROUGH */
		case ZIO_TYPE_WRITE:
		case ZIO_TYPE_FREE:
			if ((zio = vdev_queue_io(zio)) == NULL)
				return (ZIO_PIPELINE_STOP);

			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
				zio_interrupt(zio);
				return (ZIO_PIPELINE_STOP);
			}
			break;
		}
		/*
		 * Note that we ignore repair writes for TRIM because they can
		 * conflict with normal writes.  This isn't an issue because,
		 * by definition, we only repair blocks that aren't freed.
		 */
		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
		    !trim_map_write_start(zio))
			return (ZIO_PIPELINE_STOP);
	}

	vd->vdev_ops->vdev_op_io_start(zio);
	return (ZIO_PIPELINE_STOP);
}

static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    (zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE)) {

		if (zio->io_type == ZIO_TYPE_WRITE &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
			trim_map_write_done(zio);

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (zio->io_error == ENOTSUP &&
			    zio->io_type == ZIO_TYPE_FREE) {
				/* Not all devices support TRIM. */
			} else if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}

/*ARGSUSED*/
void
zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr,
    void *ignored)
{
	void *buf = zio_buf_alloc(zio->io_size);

	abd_copy_to_buf(buf, zio->io_abd, zio->io_size);

	zcr->zcr_cbinfo = zio->io_size;
	zcr->zcr_cbdata = buf;
	zcr->zcr_finish = zio_vsd_default_cksum_finish;
	zcr->zcr_free = zio_buf_free;
}
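/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the padding transform in zio_vdev_io_start() above rounds
 * a logical I/O up to the vdev's physical block size with P2ROUNDUP()
 * and zero-fills the tail.  The same power-of-two arithmetic, spelled
 * out with plain operators:
 */
static unsigned long long
roundup_pow2_example(unsigned long long size, unsigned long long align)
{
	/* align must be a power of two, e.g. 1ULL << ashift */
	return ((size + align - 1) & ~(align - 1));
}

/*
 * Example: with ashift = 12 (4 KB sectors), a 6 KB write has
 * roundup_pow2_example(6144, 4096) == 8192, so 2 KB of zeros are
 * appended before the write is issued.
 */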
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	if (zio->io_type == ZIO_TYPE_FREE &&
	    zio->io_priority != ZIO_PRIORITY_NOW) {
		switch (zio->io_error) {
		case 0:
			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
			ZIO_TRIM_STAT_BUMP(success);
			break;
		case EOPNOTSUPP:
			ZIO_TRIM_STAT_BUMP(unsupported);
			break;
		default:
			ZIO_TRIM_STAT_BUMP(failed);
			break;
		}
	}

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	/*
	 * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
	 * attempts will ever succeed.  In this case we set a persistent bit so
	 * that we don't bother with it in the future.
	 */
	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
	    zio->io_type == ZIO_TYPE_IOCTL &&
	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
		vd->vdev_nowritecache = B_TRUE;

	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    zio->io_physdone != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
		zio->io_physdone(zio->io_logical);
	}

	return (ZIO_PIPELINE_CONTINUE);
}

void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}

void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
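/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): pipeline stages are one-hot bits assigned in execution
 * order, so "zio->io_stage >>= 1" rewinds the zio to just before its
 * current stage and the executor then re-enters that stage.  A minimal
 * model of the advance logic (the _EXAMPLE stage values are
 * hypothetical):
 */
#define	STAGE_OPEN_EXAMPLE	(1 << 0)
#define	STAGE_ISSUE_EXAMPLE	(1 << 1)
#define	STAGE_DONE_EXAMPLE	(1 << 2)

/*
 * Returns the next stage bit above 'stage' that is present in
 * 'pipeline' (or STAGE_DONE_EXAMPLE if none remain).  Rewinding
 * stage to STAGE_ISSUE_EXAMPLE >> 1 makes this return
 * STAGE_ISSUE_EXAMPLE again.
 */
static unsigned
next_stage_example(unsigned stage, unsigned pipeline)
{
	do {
		stage <<= 1;
	} while ((stage & pipeline) == 0 && stage < STAGE_DONE_EXAMPLE);
	return (stage);
}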
/*
 * ==========================================================================
 * Generate and verify checksums
 * ==========================================================================
 */
static int
zio_checksum_generate(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum;

	if (bp == NULL) {
		/*
		 * This is zio_write_phys().
		 * We're either generating a label checksum, or none at all.
		 */
		checksum = zio->io_prop.zp_checksum;

		if (checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
	} else {
		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
			ASSERT(!IO_IS_ALLOCATING(zio));
			checksum = ZIO_CHECKSUM_GANG_HEADER;
		} else {
			checksum = BP_GET_CHECKSUM(bp);
		}
	}

	zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);

	return (ZIO_PIPELINE_CONTINUE);
}

static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		if (error == ECKSUM &&
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}

/*
 * ==========================================================================
 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
int
zio_worst_error(int e1, int e2)
{
	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
	int r1, r2;

	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
		if (e1 == zio_error_rank[r1])
			break;

	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
		if (e2 == zio_error_rank[r2])
			break;

	return (r1 > r2 ? e1 : e2);
}
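/*
 * Illustrative usage sketch (editorial addition, not part of the
 * original source): zio_worst_error() returns whichever argument ranks
 * later in { 0, ENXIO, ECKSUM, EIO, anything-else }.  Unknown errnos
 * fall off the end of zio_error_rank[] and therefore rank worst; when
 * both arguments rank equally, the second wins.
 */
static void
zio_worst_error_example(void)
{
	/* Any real error outranks success. */
	ASSERT3S(zio_worst_error(0, ENXIO), ==, ENXIO);
	/* Per-I/O errors (ECKSUM, EIO) outrank whole-device ENXIO. */
	ASSERT3S(zio_worst_error(ENXIO, ECKSUM), ==, ECKSUM);
	/* Unexpected errnos rank worst of all. */
	ASSERT3S(zio_worst_error(EIO, EINVAL), ==, EINVAL);
}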
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;
	zio_link_t *zl = NULL;

	if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
	    ZIO_WAIT_READY)) {
		return (ZIO_PIPELINE_STOP);
	}

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	if (zio->io_error != 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
			ASSERT(IO_IS_ALLOCATING(zio));
			ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
			/*
			 * We were unable to allocate anything, unreserve and
			 * issue the next I/O to allocate.
			 */
			metaslab_class_throttle_unreserve(
			    spa_normal_class(zio->io_spa),
			    zio->io_prop.zp_copies, zio);
			zio_allocate_dispatch(zio->io_spa);
		}
	}

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio, &zl);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them.  The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio, &zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Update the allocation throttle accounting.
 */
static void
zio_dva_throttle_done(zio_t *zio)
{
	zio_t *lio = zio->io_logical;
	zio_t *pio = zio_unique_parent(zio);
	vdev_t *vd = zio->io_vd;
	int flags = METASLAB_ASYNC_ALLOC;

	ASSERT3P(zio->io_bp, !=, NULL);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
	ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
	ASSERT(vd != NULL);
	ASSERT3P(vd, ==, vd->vdev_top);
	ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
	ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
	ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
	ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));

	/*
	 * Parents of gang children can have two flavors -- ones that
	 * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
	 * and ones that allocated the constituent blocks.  The allocation
	 * throttle needs to know the allocating parent zio so we must find
	 * it here.
	 */
	if (pio->io_child_type == ZIO_CHILD_GANG) {
		/*
		 * If our parent is a rewrite gang child then our grandparent
		 * would have been the one that performed the allocation.
		 */
		if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
			pio = zio_unique_parent(pio);
		flags |= METASLAB_GANG_CHILD;
	}

	ASSERT(IO_IS_ALLOCATING(pio));
	ASSERT3P(zio, !=, zio->io_logical);
	ASSERT(zio->io_logical != NULL);
	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
	ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);

	mutex_enter(&pio->io_lock);
	metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
	mutex_exit(&pio->io_lock);

	metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
	    1, pio);

	/*
	 * Call into the pipeline to see if there is more work that
	 * needs to be done.  If there is work to be done it will be
	 * dispatched to another taskq thread.
	 */
	zio_allocate_dispatch(zio->io_spa);
}
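/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): the parent-notification loop in zio_ready() above snapshots
 * the current head of the parent list under the lock and then walks
 * forward; parents added concurrently are prepended, so the walk never
 * sees them.  A minimal model with a singly linked list (all _example
 * names are hypothetical):
 */
typedef struct parent_example {
	struct parent_example *next;
	int notified;
} parent_example_t;

static void
notify_existing_parents_example(parent_example_t *head)
{
	parent_example_t *p, *p_next;

	/*
	 * 'head' was sampled under the lock; new parents are inserted
	 * before it, so they are (correctly) skipped by this walk.
	 */
	for (p = head; p != NULL; p = p_next) {
		p_next = p->next;	/* fetch before notifying */
		p->notified = 1;
	}
}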
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;
	zio_t *pio, *pio_next;
	metaslab_class_t *mc = spa_normal_class(spa);
	zio_link_t *zl = NULL;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If the allocation throttle is enabled, then update the accounting.
	 * We only track child I/Os that are part of an allocating async
	 * write.  We must do this since the allocation is performed
	 * by the logical I/O but the actual write is done by child I/Os.
	 */
	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
	    zio->io_child_type == ZIO_CHILD_VDEV) {
		ASSERT(mc->mc_alloc_throttle_enabled);
		zio_dva_throttle_done(zio);
	}

	/*
	 * If the allocation throttle is enabled, verify that
	 * we have decremented the refcounts for every I/O that was throttled.
	 */
	if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
		ASSERT(bp != NULL);
		metaslab_group_alloc_verify(spa, zio->io_bp, zio);
		VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
	}

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=,
			    BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = NULL;
			abd_t *adata = zio->io_abd;

			if (asize != psize) {
				adata = abd_alloc_linear(asize, B_TRUE);
				abd_copy(adata, zio->io_abd, psize);
				abd_zero_off(adata, psize, asize - psize);
			}

			if (adata != NULL)
				abuf = abd_borrow_buf_copy(adata, asize);

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (adata != NULL)
				abd_return_buf(adata, abuf, asize);

			if (asize != psize)
				abd_free(adata);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		zl = NULL;
		for (pio = zio_walk_parents(zio, &zl); pio != NULL;
		    pio = pio_next) {
			zio_link_t *remove_zl = zl;

			pio_next = zio_walk_parents(zio, &zl);
			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, remove_zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
#if defined(illumos) || !defined(_KERNEL)
			ASSERT(zio->io_tqent.tqent_next == NULL);
#else
			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute,
			    zio, 0, &zio->io_tqent);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;

		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	zl = NULL;
	for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
		zio_link_t *remove_zl = zl;
		pio_next = zio_walk_parents(zio, &zl);
		zio_remove_child(pio, zio, remove_zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}

/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_write_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_compress,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_throttle,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
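/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): zio_pipeline[] maps each one-hot stage bit, in order, to
 * its handler, so the executor can turn "next stage bit" into an
 * array index from the bit's position.  A minimal model of that
 * dispatch (the _example names are hypothetical):
 */
typedef int (*stage_func_example_t)(void *);

static int
dispatch_stage_example(unsigned stage_bit, stage_func_example_t *table,
    void *arg)
{
	int idx = 0;

	/* idx becomes the 0-based position of the single set bit. */
	while (stage_bit >>= 1)
		idx++;

	return (table[idx](arg));
}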
/*
 * Compare two zbookmark_phys_t's to see which we would reach first in a
 * pre-order traversal of the object tree.
 *
 * This is simple in every case aside from the meta-dnode object.  For all
 * other objects, we traverse them in order (object 1 before object 2, and
 * so on).  However, all of these objects are traversed while traversing
 * object 0, since the data it points to is the list of objects.  Thus, we
 * need to convert to a canonical representation so we can compare
 * meta-dnode bookmarks to non-meta-dnode bookmarks.
 *
 * We do this by calculating "equivalents" for each field of the zbookmark.
 * zbookmarks outside of the meta-dnode use their own object and level, and
 * calculate the level 0 equivalent (the first L0 blkid that is contained
 * in the blocks this bookmark refers to) by multiplying their blkid by
 * their span (the number of L0 blocks contained within one block at their
 * level).  zbookmarks inside the meta-dnode calculate their object
 * equivalent (which is L0equiv * dnodes per data block), use 0 for their
 * L0equiv, and use level + 1<<31 (any value larger than a level could ever
 * be) for their level.  This causes them to always compare before a
 * bookmark in their object equivalent, compare appropriately to bookmarks
 * in other objects, and to compare appropriately to other bookmarks in the
 * meta-dnode.
 */
int
zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
    const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
{
	/*
	 * These variables represent the "equivalent" values for the
	 * zbookmark, after converting zbookmarks inside the meta dnode
	 * to their normal-object equivalents.
	 */
	uint64_t zb1obj, zb2obj;
	uint64_t zb1L0, zb2L0;
	uint64_t zb1level, zb2level;

	if (zb1->zb_object == zb2->zb_object &&
	    zb1->zb_level == zb2->zb_level &&
	    zb1->zb_blkid == zb2->zb_blkid)
		return (0);

	/*
	 * BP_SPANB calculates the span in blocks.
	 */
	zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
	zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);

	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
		zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb1L0 = 0;
		zb1level = zb1->zb_level + COMPARE_META_LEVEL;
	} else {
		zb1obj = zb1->zb_object;
		zb1level = zb1->zb_level;
	}

	if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
		zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
		zb2L0 = 0;
		zb2level = zb2->zb_level + COMPARE_META_LEVEL;
	} else {
		zb2obj = zb2->zb_object;
		zb2level = zb2->zb_level;
	}

	/* Now that we have a canonical representation, do the comparison. */
	if (zb1obj != zb2obj)
		return (zb1obj < zb2obj ? -1 : 1);
	else if (zb1L0 != zb2L0)
		return (zb1L0 < zb2L0 ? -1 : 1);
	else if (zb1level != zb2level)
		return (zb1level > zb2level ? -1 : 1);

	/*
	 * This can (theoretically) happen if the bookmarks have the same
	 * object and level, but different blkids, if the block sizes are
	 * not the same.  There is presently no way to change the indirect
	 * block sizes.
	 */
	return (0);
}
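/*
 * Illustrative worked example (editorial addition, not part of the
 * original source): with 128K indirect blocks (ibs = 17), each
 * indirect level holds 1024 block pointers, so a level-1 bookmark at
 * blkid 3 in an ordinary object spans 1024 L0 blocks and its L0
 * equivalent is 3 * 1024 = 3072.  The sketch below spells out that
 * arithmetic; span_example() mirrors what BP_SPANB() computes for a
 * bookmark at the given level (SPA_BLKPTRSHIFT is the log2 of the
 * 128-byte blkptr size).
 */
static uint64_t
span_example(uint8_t ibs, int level)
{
	/* Each indirect level multiplies the span by blkptrs-per-block. */
	uint64_t blkptrs_per_block = 1ULL << (ibs - SPA_BLKPTRSHIFT);
	uint64_t span = 1;

	while (level-- > 0)
		span *= blkptrs_per_block;
	return (span);		/* span_example(17, 1) == 1024 */
}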
/*
 * This function answers the following question: given that last_block is
 * the place our traversal stopped last time, are we guaranteed to have
 * visited every node under subtree_root?  The raw output of
 * zbookmark_compare cannot answer this by itself.  Instead we pass in a
 * modified version of subtree_root: by incrementing its block id and
 * checking whether last_block is at or before that point, we can tell
 * whether having visited last_block implies that all of subtree_root's
 * children have been visited.
 */
boolean_t
zbookmark_subtree_completed(const dnode_phys_t *dnp,
    const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
{
	zbookmark_phys_t mod_zb = *subtree_root;
	mod_zb.zb_blkid++;
	ASSERT(last_block->zb_level == 0);

	/* The objset_phys_t isn't before anything. */
	if (dnp == NULL)
		return (B_FALSE);

	/*
	 * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
	 * data block size in sectors, because that variable is only used if
	 * the bookmark refers to a block in the meta-dnode.  Since we don't
	 * know without examining it what object it refers to, and there's
	 * no harm in passing in this value in other cases, we always pass
	 * it in.
	 *
	 * We pass in 0 for the indirect block size shift because zb2 must
	 * be level 0.  The indirect block size is only used to calculate
	 * the span of the bookmark, but since the bookmark must be level 0,
	 * the span is always 1, so the math works out.
	 *
	 * If you make changes to how the zbookmark_compare code works, be
	 * sure to verify that this code still works afterwards.
	 */
	return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
	    1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
	    last_block) <= 0);
}
Index: head/sys/cddl/contrib/opensolaris
===================================================================
--- head/sys/cddl/contrib/opensolaris	(revision 329627)
+++ head/sys/cddl/contrib/opensolaris	(revision 329628)

Property changes on: head/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor-sys/illumos/dist:r316910